def registerExecutables(self, workflow=None):
	"""
	2012.2.15
	"""
	AbstractWorkflow.registerExecutables(self)
	namespace = self.namespace
	version = self.version
	operatingSystem = self.operatingSystem
	architecture = self.architecture
	clusters_size = self.clusters_size
	site_handler = self.site_handler
	variationSrcPath = self.variationSrcPath
	vervetSrcPath = self.vervetSrcPath

	#2012.8.7 each cell is a tuple of (executable, clusterSizeMultiplier); the multiplier is 0 if no clustering is needed.
	executableClusterSizeMultiplierList = []
	Stock_250kDB = Executable(namespace=namespace, name="Stock_250kDB", version=version, \
		os=operatingSystem, arch=architecture, installed=True)
	Stock_250kDB.addPFN(PFN("file://" + os.path.join(self.variationSrcPath, "db/Stock_250kDB.py"), site_handler))
	executableClusterSizeMultiplierList.append((Stock_250kDB, 0))

	self.addExecutableAndAssignProperClusterSize(executableClusterSizeMultiplierList, defaultClustersSize=self.clusters_size)
def getTopSNPTestType(self, get_closest, min_MAF, allow_two_sample_overlapping, results_type, \
					test_type_id, null_distribution_type_id):
	"""
	2008-10-30
		null_distribution_type_id in CandidateGeneTopSNPTestRMType doesn't matter anymore. set it to 1.
	2008-10-26
		min_distance is removed from CandidateGeneTopSNPTestRMType.
	2008-10-16
		check which TopSNPTest type this is; create one if it doesn't exist in db.
	"""
	if self.debug:
		sys.stderr.write("Getting CandidateGeneTopSNPTestRMType ...")
	rows = Stock_250kDB.CandidateGeneTopSNPTestRMType.query.\
		filter_by(get_closest=get_closest).\
		filter(Stock_250kDB.CandidateGeneTopSNPTestRMType.min_MAF >= min_MAF - 0.0001).\
		filter(Stock_250kDB.CandidateGeneTopSNPTestRMType.min_MAF <= min_MAF + 0.0001).\
		filter_by(allow_two_sample_overlapping=allow_two_sample_overlapping).\
		filter_by(results_type=results_type).\
		filter_by(test_type_id=test_type_id).\
		filter_by(null_distribution_type_id=null_distribution_type_id)
	if rows.count() > 0:
		_type = rows.first()
	else:
		_type = Stock_250kDB.CandidateGeneTopSNPTestRMType(get_closest=get_closest, \
			min_MAF=min_MAF, \
			allow_two_sample_overlapping=allow_two_sample_overlapping, \
			results_type=results_type, \
			test_type_id=test_type_id, \
			null_distribution_type_id=null_distribution_type_id)
	if self.debug:
		sys.stderr.write("Done.\n")
	return _type
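
# --- Hedged sketch (not part of the original module) ---
# getTopSNPTestType() above matches min_MAF with a +/-0.0001 window instead of exact
# equality, because floats stored in the db may differ from the query value by rounding.
# A generic helper capturing that idiom might look like this; "query" and "column" stand
# for any SQLAlchemy query/column pair (hypothetical names, for illustration only).
def addFloatEqualityFilter(query, column, value, epsilon=0.0001):
	"""Restrict query to rows whose column lies within [value-epsilon, value+epsilon]."""
	return query.filter(column >= value - epsilon).filter(column <= value + epsilon)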
def submit_to_call_QC(cls, session, row_id2NA_mismatch_rate, QC_method_id, user, min_probability, \
					row_id12row_id2, call_method_id, readme):
	"""
	2008-05-21
		ecotype_id, call_info_id = row_id	#bug here, order changed.
	2008-05-19
		NA_mismatch_ls was expanded
	2008-05-06
		add readme
	2008-05-05
		add ecotype_id, min_probability, tg_ecotype_id
	"""
	sys.stderr.write("Submitting row_id2NA_mismatch_rate to database ...")
	row_id_ls = row_id2NA_mismatch_rate.keys()
	row_id_ls.sort()	#try to keep them in call_info_id order
	for row_id in row_id_ls:
		NA_mismatch_ls = row_id2NA_mismatch_rate[row_id]
		ecotype_id, call_info_id = row_id	#bug here, order changed.
		tg_ecotype_id = row_id12row_id2[row_id]
		na_rate, mismatch_rate, no_of_nas, no_of_totals, no_of_mismatches, no_of_non_na_pairs, \
			relative_NA_rate, relative_no_of_NAs, relative_no_of_totals = NA_mismatch_ls
		#call_QC stores the relative NA rate. call_info already stores the independent NA rate.
		na_rate, no_of_nas, no_of_totals = relative_NA_rate, relative_no_of_NAs, relative_no_of_totals
		callqc = Stock_250kDB.CallQC(call_info_id=call_info_id, min_probability=min_probability, \
			ecotype_id=ecotype_id, tg_ecotype_id=tg_ecotype_id, \
			qc_method_id=QC_method_id, call_method_id=call_method_id, na_rate=na_rate, mismatch_rate=mismatch_rate, \
			no_of_nas=no_of_nas, no_of_totals=no_of_totals, no_of_mismatches=no_of_mismatches, \
			no_of_non_na_pairs=no_of_non_na_pairs, created_by=user)
		callqc.readme = readme
		session.add(callqc)
		"""
		data_insert_ls = [row_id[0]] + NA_mismatch_ls + [QC_method_id, user]	#row_id is (call_info_id, ecotypeid)
		curs.execute("insert into " + call_QC_table + " (call_info_id, na_rate, mismatch_rate, no_of_nas, no_of_totals, \
			no_of_mismatches, no_of_non_NA_pairs, QC_method_id, created_by) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)", data_insert_ls)
		"""
	sys.stderr.write("Done.\n")
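
# --- Hedged sketch (not part of the original module) ---
# The shapes submit_to_call_QC() expects, inferred from the unpacking code above;
# all values below are made up for illustration.
#	row_id2NA_mismatch_rate: (ecotype_id, call_info_id) -> [na_rate, mismatch_rate,
#		no_of_nas, no_of_totals, no_of_mismatches, no_of_non_na_pairs,
#		relative_NA_rate, relative_no_of_NAs, relative_no_of_totals]
#	row_id12row_id2: (ecotype_id, call_info_id) -> tg_ecotype_id
example_row_id2NA_mismatch_rate = {(6909, 101): [0.02, 0.01, 10, 500, 4, 400, 0.03, 15, 500]}
example_row_id12row_id2 = {(6909, 101): 6909}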
def connectDB(self):
	"""
	2012.11.18
	"""
	db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, db_user=self.db_user, db_passwd=self.db_passwd, \
		hostname=self.hostname, dbname=self.dbname, schema=self.schema, port=self.port)
	db_250k.setup(create_tables=False)
	self.db_250k = db_250k
def connectDB(self):
	"""
	2012.6.5
		overwrite the parent class
	"""
	self.db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, db_user=self.db_user, db_passwd=self.db_passwd, \
		hostname=self.hostname, dbname=self.dbname, schema=self.schema, port=self.port)
	self.db_250k.setup(create_tables=False)
def getThresholdType(self, r1_pvalue_cutoff, r2_pvalue_cutoff):
	"""
	2009-5-2
	"""
	threshold_type = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethodsType.query.\
		filter_by(r1_threshold=r1_pvalue_cutoff).\
		filter_by(r2_threshold=r2_pvalue_cutoff).first()
	if not threshold_type:
		threshold_type = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethodsType(r1_threshold=r1_pvalue_cutoff, \
			r2_threshold=r2_pvalue_cutoff)
	return threshold_type
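
# --- Hedged sketch (not part of the original module) ---
# getThresholdType() above, like getTopSNPTestType() and getHistType() elsewhere in this
# section, follows a get-or-create idiom: query by the natural key first and instantiate
# a new row only when nothing matches. A generic version might look like this; TableClass
# stands for any Elixir entity exposing .query as above. Note the original methods never
# call session.add() on the new instance, which is consistent with Elixir's default
# save-on-init behavior (an assumption here, not confirmed by the source).
def getOrCreate(TableClass, **natural_key):
	instance = TableClass.query.filter_by(**natural_key).first()
	if not instance:
		instance = TableClass(**natural_key)
	return instance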
def run(self):
	"""
	2012.3.2
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
		hostname=self.hostname, database=self.dbname)
	db_250k.setup(create_tables=False)

	# Create an abstract DAG
	workflowName = os.path.splitext(os.path.basename(self.outputFname))[0]
	workflow = self.initiateWorkflow(workflowName)

	self.registerExecutables(workflow)
	self.registerCustomExecutables(workflow)

	#find all hdf5 correlation files
	inputFnameLs = self.getFilesWithProperSuffixFromFolder(self.inputFolder, suffix='.h5')
	inputData = self.registerAllInputFiles(workflow, inputFnameLs=inputFnameLs, \
		input_site_handler=self.input_site_handler, \
		pegasusFolderName=self.pegasusFolderName)

	#organize final output plots by biology_category, biology_category_id2outputfolder
	sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(self.biology_category_id, \
		access=self.access)
	sameCategoryPhenotypeMethodIDLs = [pm.id for pm in sameCategoryPhenotypeMethodLs]
	phenotype_method_id_ls = self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs

	result_list = db_250k.getResultLs(call_method_id=self.call_method_id, \
		analysis_method_id_ls=self.analysis_method_id_ls, \
		phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method_id)
	result_id_ls = [result.id for result in result_list]
	sys.stderr.write("%s results.\n" % (len(result_id_ls)))

	result_peak_ls = db_250k.getResultPeakList(result_id_ls=result_id_ls, \
		result_peak_type_id=self.result_peak_type_id)

	self.addJobs(workflow, result_peak_ls=result_peak_ls, inputData=inputData, datasetName=self.datasetName, \
		chunkSize=self.chunkSize, pegasusFolderName=self.pegasusFolderName)

	# Write the DAX to the output file
	outf = open(self.outputFname, 'w')
	workflow.writeXML(outf)
def run(self):
	"""
	2008-05-17
		-check_method_id_exists()
		if input_dir is dir:
			-submit_call_dir2db()
				-get_new_call_id()
					-get_cur_max_call_id()
		elif input_dir is file:
			-submit_call_file2db()
	"""
	if self.debug:
		import pdb
		pdb.set_trace()

	#database connection and etc
	db = self.db_250k
	session = db.session
	session.begin()

	chr_pos2db_id = db.getSNPChrPos2ID()
	#import MySQLdb
	#conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
	#curs = conn.cursor()
	curs = None
	if not self.check_method_id_exists(db, self.method_id):
		sys.stderr.write("Warning: method_id=%s not in %s. A new entry to be created.\n" % \
			(self.method_id, self.call_method_table))
		cm = Stock_250kDB.CallMethod(short_name=self.call_method_short_name, id=self.method_id)
		session.add(cm)
		session.flush()
		self.method_id = cm.id
	if self.commit:
		self.submit_call2db(curs, self.input_dir, self.call_info_table, self.output_dir, self.method_id, self.db_user, \
			chr_pos2db_id=chr_pos2db_id, db=db)
		#curs.execute("commit")
		session.flush()
		session.commit()
def getHistType(cls, call_method_id, min_distance, get_closest, min_MAF, allow_two_sample_overlapping, results_type, \
			null_distribution_type_id):
	"""
	2008-11-08
		become a classmethod
	2008-10-16
	"""
	sys.stderr.write("Getting ScoreRankHistogramType ...")
	rows = Stock_250kDB.ScoreRankHistogramType.query.filter_by(call_method_id=call_method_id).\
		filter_by(min_distance=min_distance).filter_by(get_closest=get_closest).\
		filter(Stock_250kDB.ScoreRankHistogramType.min_MAF >= min_MAF - 0.0001).\
		filter(Stock_250kDB.ScoreRankHistogramType.min_MAF <= min_MAF + 0.0001).\
		filter_by(allow_two_sample_overlapping=allow_two_sample_overlapping).filter_by(results_type=results_type).\
		filter_by(null_distribution_type_id=null_distribution_type_id)
	if rows.count() > 0:
		hist_type = rows.first()
	else:
		hist_type = Stock_250kDB.ScoreRankHistogramType(call_method_id=call_method_id, min_distance=min_distance, \
			get_closest=get_closest, min_MAF=min_MAF, results_type=results_type, \
			allow_two_sample_overlapping=allow_two_sample_overlapping, \
			null_distribution_type_id=null_distribution_type_id)
	sys.stderr.write("Done.\n")
	return hist_type
def runEnrichmentTestToGetNullData(self, session, pd):
	"""
	2008-11-04
		stop checking TopSNPTestRMNullData for an existing identical run_no. the chance is small;
		however, the check incurs a huge load on the db server.
	2008-11-05
		return result and null_data to be sent over to the output node, which saves them in batch
		in MpiTopSNPTest.py (the output node connects to the master db).
	2008-10-30
		run enrichment test, also to get NULL data based on either null distribution
	"""
	if self.debug:
		sys.stderr.write("Running Enrichment test on results_id=%s, list_type_id=%s, no_of_top_snps=%s, no_of_top_snps_ls=%s, type_id=%s, min_score=%s, ... " % \
			(getattr(pd, 'results_id', -1), getattr(pd, 'list_type_id', -1), getattr(pd, 'no_of_top_snps', -1), \
			repr(getattr(pd, 'no_of_top_snps_ls', -1)), getattr(pd, 'type_id', -1), getattr(pd, 'min_score', -1)))
	ResultsClass = Stock_250kDB.ResultsMethod
	TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRM
	rm = ResultsClass.get(pd.results_id)
	min_distance = pd.min_distance
	min_MAF = pd.min_MAF
	get_closest = pd.get_closest
	no_of_top_snps_ls = getattr(pd, 'no_of_top_snps_ls', [])
	min_score_ls = getattr(pd, 'min_score_ls', [])
	if no_of_top_snps_ls:
		cutoff_ls = no_of_top_snps_ls
		cutoff_type = 1
	else:
		cutoff_ls = min_score_ls
		cutoff_type = 2
	commit = getattr(pd, 'commit', 0)	#2008-10-30 save objects right away
	if not rm:
		sys.stderr.write("No results available for results_id=%s.\n" % pd.results_id)
		return None

	candidate_gene_set = self.dealWithCandidateGeneList(pd.list_type_id, return_set=True)	#internal cache
	no_of_candidate_genes = len(candidate_gene_set)
	no_of_total_snps = None	#same for all tests from the same rm
	null_data_ls = []
	result_ls = []
	if pd.null_distribution_type_id == 2 or pd.null_distribution_type_id == 3:
		pd.need_permData = 1	#need permData in getTestResult() even when the result is directly found in the database
		for i in range(pd.no_of_permutations):
			if pd.null_distribution_type_id == 2:
				if no_of_total_snps is None:	#need to get it from the file
					if cutoff_type == 1:
						pd.no_of_top_snps = cutoff_ls[0]
					elif cutoff_type == 2:
						pd.min_score = cutoff_ls[0]
					return_data = self.getTestResult(session, rm, TestResultClass, pd)
					if return_data.permData:	#permData could be None
						no_of_total_snps = return_data.permData.no_of_total_snps
					else:
						if self.debug:
							sys.stderr.write("Warning: No permData from getTestResult(). aborted.\n")
						break
				shift = random.randint(1, no_of_total_snps)
				run_no = shift	#use this to link all NULL data under different no_of_top_snps together
			else:
				if no_of_candidate_genes > len(pd.total_gene_id_ls):
					if self.debug:
						sys.stderr.write("no_of_candidate_genes %s is bigger than no_of_total_genes, %s.\n" % \
							(no_of_candidate_genes, len(pd.total_gene_id_ls)))
					break
				random_candidate_gene_ls = random.sample(pd.total_gene_id_ls, no_of_candidate_genes)
				#take the sum of all gene ids, modulo 1 million. the chance that any two random
				#gene lists share this number is very small.
				run_no = sum(random_candidate_gene_ls) % 1000000
				random_candidate_gene_set = Set(random_candidate_gene_ls)
				random_candidate_gene_snp_gw_index_ls = None	#set it to None before every permutation
			for cutoff in cutoff_ls:
				if cutoff_type == 1:
					pd.no_of_top_snps = cutoff
				elif cutoff_type == 2:
					pd.min_score = cutoff
				return_data = self.getTestResult(session, rm, TestResultClass, pd)
				result = return_data.result
				permData = return_data.permData
				if result:
					if result.id is None:	#need to return this to save later
						result_ls.append(result)
					"""
					#2008-11-04 don't care about repeating run_no. the chance is small;
					#however, the check incurs a huge load on the db server.
					rows = Stock_250kDB.TopSNPTestRMNullData.query.\
						filter_by(observed_id=result.id).\
						filter_by(run_no=run_no).\
						filter_by(null_distribution_type_id=pd.null_distribution_type_id)
					if rows.count()>0:
						if self.debug:
							sys.stderr.write("null data for observed_id=%s, run_no=%s, null_distribution_type_id=%s already in db.\n"%\
								(result.id, run_no, pd.null_distribution_type_id))
						continue
					"""
					if len(result.null_data_ls) > pd.no_of_permutations:	#skip if there are too many already
						continue
					if pd.null_distribution_type_id == 2:
						#indices of the top SNPs above a certain cutoff
						top_snp_index_ls = numpy.hstack((permData.candidate_gene_snp_index_ls, \
							permData.non_candidate_gene_snp_index_ls))
						#get corresponding (chr,pos)s of the top SNPs after they are shifted.
						looped_chr_pos_ls = self.get_looped_chr_pos_ls(top_snp_index_ls, permData.no_of_total_snps, \
							permData.total_chr_pos_ar, shift=shift)
						#after shifting (permutation), how many are close to candidate genes
						looped_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, \
							looped_chr_pos_ls, pd.snps_context_wrapper)
						new_candidate_sample_size = len(looped_candidate_gene_snp_index_ls)
						new_candidate_gw_size = result.candidate_gw_size	#same as observed
					else:
						top_snp_chr_pos_ls = permData.candidate_gene_snp_chr_pos_ls + permData.non_candidate_gene_snp_chr_pos_ls
						if random_candidate_gene_snp_gw_index_ls is None:
							#2008-10-31 if it's None, generate it. same for every simulation
							random_candidate_gene_snp_gw_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, \
								permData.total_chr_pos_ar, pd.snps_context_wrapper)
						random_candidate_gene_snp_sample_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, \
							top_snp_chr_pos_ls, pd.snps_context_wrapper)
						new_candidate_sample_size = len(random_candidate_gene_snp_sample_index_ls)
						new_candidate_gw_size = len(random_candidate_gene_snp_gw_index_ls)
					null_data = Stock_250kDB.TopSNPTestRMNullData(observed=result, \
						candidate_sample_size=new_candidate_sample_size, \
						candidate_gw_size=new_candidate_gw_size, \
						run_no=run_no, \
						null_distribution_type_id=pd.null_distribution_type_id)
					null_data_ls.append(null_data)
					session.save(null_data)	#put in the session cache
					if commit:
						session.flush()
	elif pd.null_distribution_type_id == 1:
		for cutoff in cutoff_ls:
			if cutoff_type == 1:
				pd.no_of_top_snps = cutoff
			elif cutoff_type == 2:
				pd.min_score = cutoff
			return_data = self.getTestResult(session, rm, TestResultClass, pd)
			if return_data.result and return_data.result.id is None:
				result_ls.append(return_data.result)
	else:
		sys.stderr.write("null_distribution_type %s not supported.\n" % (pd.null_distribution_type_id))
		return None
	if self.debug:
		sys.stderr.write("Done.\n")
	return_data = PassingData(result_ls=result_ls, null_data_ls=null_data_ls)
	return return_data
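
# --- Hedged sketch (not part of the original module) ---
# How run_no is derived for null_distribution_type_id=3 above: the sum of the sampled
# gene ids modulo one million serves as a cheap, near-unique tag linking all NULL data
# rows produced from one random gene list. All values below are illustrative.
import random
example_total_gene_id_ls = range(1, 20001)	#hypothetical genome-wide gene ids
example_random_gene_ls = random.sample(example_total_gene_id_ls, 300)
example_run_no = sum(example_random_gene_ls) % 1000000
print "run_no for this permutation:", example_run_no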
def add2DB(self, db=None, short_name=None, phenotype_method_id=None, call_method_id=None, data_description=None, \
		method_description=None, comment=None, inputFname=None, user=None, results_method_type_id=None, \
		analysis_method_id=None, results_method_type_short_name=None, data_dir=None, commit=0, \
		cnv_method_id=None):
	"""
	2012.12.28 overhaul
	2012.6.6
		pass db to getOneResultJsonData()
	2012.3.9
		add locus_type_id to ResultsMethod
	2011-2-22
		add argument cnv_method_id
		deal with the association file format change. a locus is now identified by Snps.id or CNV.id
	2010-5-3
		becomes classmethod
		store the json structure of the top 10000 SNPs from the db_entry into db
	2008-09-30
		don't save results_method into database if something bad happened while getting data out of the file.
	2008-09-09
		directly copy the result file if analysis_method_id==13
	2008-08-19
		automatically generate short_name if it's NULL
	2008-07-16
		adjust to new Elixir-based db api. new analysis_method_id is added to results_method.
	2008-05-30
		go to output_dir
		drop copyAndReformatResultFile()
		use store_file()
	2008-05-26
		add results_method_type_id and results_method_type_short_name
	2008-05-24
		conveniently wrap up all code so that both this program and plone can call it
	"""
	session = db.session
	session.begin()

	rmt = Stock_250kDB.ResultsMethodType.get(results_method_type_id)
	if not rmt and results_method_type_short_name is not None:
		#create a new results method type
		rmt = Stock_250kDB.ResultsMethodType(short_name=results_method_type_short_name)
		session.add(rmt)
	if not rmt:
		sys.stderr.write("No results method type available for results_method_type_id=%s.\n" % results_method_type_id)
		sys.exit(3)

	if call_method_id:	#2012.6.6
		cm = Stock_250kDB.CallMethod.query.get(call_method_id)
		locus_type_id = cm.locus_type_id
	else:
		cm = None
		locus_type_id = None

	db_entry = db.checkResultsMethod(call_method_id=call_method_id, phenotype_method_id=phenotype_method_id, \
		analysis_method_id=analysis_method_id, \
		cnv_method_id=cnv_method_id, accession_set_id=None, results_method_type_id=results_method_type_id)
	if db_entry:
		sys.stderr.write("There is already an entry in results_method (id=%s) with the same (call_method_id, phenotype_method_id, analysis_method_id, results_method_type_id)=(%s, %s, %s, %s).\n" % \
			(db_entry.id, call_method_id, phenotype_method_id, analysis_method_id, results_method_type_id))
		sys.exit(2)

	db_entry = db.getResultsMethod(data_dir=data_dir, call_method_id=call_method_id, phenotype_method_id=phenotype_method_id, \
		analysis_method_id=analysis_method_id, \
		cnv_method_id=cnv_method_id, accession_set_id=None, results_method_type_id=results_method_type_id, \
		method_description=method_description, no_of_accessions=None, \
		no_of_loci=None, filename=None, original_filename=inputFname, \
		data_description=data_description, comment=comment, created_by=user, locus_type_id=locus_type_id)	#2012.3.9

	if commit:
		db_entry.filename = os.path.join(db.data_dir, db_entry.constructRelativePath(data_dir=data_dir))
		localAbsPath = os.path.join(data_dir, db_entry.constructRelativePath(data_dir=data_dir))
		if db_entry.analysis_method_id == 13:
			self.srcFilenameLs.append(inputFname)
			self.dstFilenameLs.append(localAbsPath)
			exit_code = self.copyResultsFile(db, inputFname, db_entry, user=user, output_fname=localAbsPath)
		else:
			#2013.1.10 add some db_entry attributes to the hdf5 file
			db.addAttributesToResultFile(db_entry=db_entry, inputFname=inputFname)
			inputFileBasename = os.path.basename(inputFname)
			#moveFileIntoDBAffiliatedStorage() will also set db_entry.path
			exit_code = db.moveFileIntoDBAffiliatedStorage(db_entry=db_entry, filename=inputFileBasename, \
				inputDir=os.path.split(inputFname)[0], \
				outputDir=data_dir, \
				relativeOutputDir=None, shellCommand='cp -rL', \
				srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs, \
				constructRelativePathFunction=db_entry.constructRelativePath, data_dir=data_dir)
			#exit_code = self.copyAndReformatResultFile(db, inputFname, db_entry, user=user, output_fname=localAbsPath)
		if exit_code == 0:
			session.add(db_entry)
			if db_entry.file_size is None:
				db.updateDBEntryPathFileSize(db_entry=db_entry, data_dir=data_dir)
			if db_entry.md5sum is None:
				db.updateDBEntryMD5SUM(db_entry=db_entry, data_dir=data_dir)
			#2010-5-3 store the json structure of the top 10000 SNPs from the db_entry into db
			no_of_top_snps = 10000
			if db_entry.analysis_method.min_maf is not None:
				min_MAF = db_entry.analysis_method.min_maf
			else:
				min_MAF = 0
			try:	#2011-2-24
				if call_method_id:	#call method, snp dataset
					db_id2chr_pos = db.snp_id2chr_pos
				elif cnv_method_id:
					if db._cnv_method_id != cnv_method_id:
						db.cnv_id2chr_pos = cnv_method_id
					db_id2chr_pos = db.cnv_id2chr_pos
				pdata = PassingData(db_id2chr_pos=db_id2chr_pos)
				json_data = db.getOneResultJsonData(result_id=db_entry.id, min_MAF=min_MAF, no_of_top_snps=no_of_top_snps, \
					pdata=pdata, data_dir=data_dir)	#2011-2-24 pass pdata to getOneResultJsonData()
				rm_json = Stock_250kDB.ResultsMethodJson(min_maf=min_MAF, no_of_top_snps=no_of_top_snps)
				rm_json.result = db_entry
				rm_json.json_data = json_data
				session.add(rm_json)
			except:
				sys.stderr.write('Exception in saving results_method_json (aborted): %s\n' % repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			#something bad happened while getting data out of the file. don't save this results_method.
			session.delete(db_entry)
			sys.stderr.write("Error: copy file from %s to %s failed.\n" % (inputFname, localAbsPath))
			session.rollback()
			self.cleanUpAndExitOnFailure(exitCode=3)
		session.flush()
		session.commit()
	else:	#default is also rollback(); made explicit here to demonstrate good practice.
		session.rollback()
	self.reset_marker_pos2snp_id()
def run(self):
	"""
	"""
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user, password=self.passwd, \
		hostname=self.hostname, database=self.dbname)
	db.setup(create_tables=False)
	session = db.session
	if self.debug:
		import pdb
		pdb.set_trace()

	chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(self.ancestral_allele_fname)

	pheno_data = SNPData(input_fname=self.phenotype_fname, turn_into_integer=0, ignore_2nd_column=1)
	pheno_data = self.process_phenotype_data(pheno_data)
	geno_data = SNPData(input_fname=self.genotype_fname, turn_into_array=1, matrix_data_type=int, ignore_2nd_column=1)

	query = Stock_250kDB.ResultsMethod.query.filter_by(call_method_id=self.call_method_id).\
		filter_by(analysis_method_id=self.analysis_method_id).\
		filter_by(phenotype_method_id=self.phenotype_method_id)
	if query.count() == 1:
		rm = query.first()
	elif query.count() > 1:
		sys.stderr.write("Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n" % \
			(self.call_method_id, self.analysis_method_id, self.phenotype_method_id))
		rm = query.first()
	else:
		sys.stderr.write("Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n" % \
			(self.call_method_id, self.analysis_method_id, self.phenotype_method_id))
		sys.exit(3)

	phenotype_ls_data = self.get_phenotype_ls(rm, self.no_of_top_snps, chr_pos2ancestral_allele, pheno_data, geno_data, \
		self.min_MAF, results_directory=self.input_dir)

	import pylab
	pylab.clf()
	hist_patch_ls = []
	legend_ls = []
	if len(phenotype_ls_data.ancestral_allele_phenotype_ls) > 2:
		n1 = pylab.hist(phenotype_ls_data.ancestral_allele_phenotype_ls, 100, alpha=0.4, normed=1)
		hist_patch_ls.append(n1[2][0])	#first patch among all patches of a histogram
		legend_ls.append('ancestral allele')
	if len(phenotype_ls_data.derived_allele_phenotype_ls) > 2:
		n2 = pylab.hist(phenotype_ls_data.derived_allele_phenotype_ls, 100, alpha=0.4, normed=1, facecolor='r')
		hist_patch_ls.append(n2[2][0])
		legend_ls.append('derived allele')
	pylab.legend(hist_patch_ls, legend_ls)
	if self.output_fname_prefix:
		pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
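
# --- Hedged sketch (not part of the original module) ---
# The legend trick used in run() above: pylab.hist() returns (counts, bins, patches);
# the first patch stands in for the whole histogram when building the legend. Toy data
# below, for illustration only.
import pylab
example_n = pylab.hist([1, 2, 2, 3, 3, 3], 3, alpha=0.4)
pylab.legend([example_n[2][0]], ['example series'])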
def run(self):
	"""
	2008-08-19
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
		hostname=self.hostname, database=self.dbname, schema=self.schema)
	db.setup(create_tables=False)
	self.db_250k = db
	session = db.session

	total_gene_id_ls = get_total_gene_ls(db.metadata.bind)
	no_of_total_genes = len(total_gene_id_ls)
	#no_of_total_genes = self.getNoOfTotalGenes(db, self.gene_table, self.tax_id)

	#if self.commit:
	#	session.begin()

	_type = self.getTopSNPTestType(self.get_closest, self.min_MAF, \
		self.allow_two_sample_overlapping, self.results_type, \
		self.test_type_id, self.null_distribution_type_id)
	snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
	pd = PassingData(list_type_id=self.list_type_id, snps_context_wrapper=snps_context_wrapper, \
		no_of_total_genes=no_of_total_genes, results_directory=self.results_directory, \
		min_MAF=self.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance, \
		no_of_top_snps=self.no_of_top_snps, min_sample_size=self.min_sample_size, test_type_id=self.test_type_id, \
		results_type=self.results_type, no_of_permutations=self.no_of_permutations, \
		no_of_min_breaks=self.no_of_min_breaks, type_id=_type.id, \
		null_distribution_type_id=self.null_distribution_type_id, \
		allow_two_sample_overlapping=self.allow_two_sample_overlapping, total_gene_id_ls=total_gene_id_ls, \
		min_score=self.min_score, commit=self.commit)

	if getattr(self, 'output_fname', None):
		writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
		header_row = []
		for column in Stock_250kDB.CandidateGeneTopSNPTest.c.keys():
			header_row.append(column)
		writer.writerow(header_row)
	else:
		writer = None

	#2008-10-31 set up the cutoff list accordingly
	if self.min_score:
		pd.min_score_ls = [self.min_score]
	else:
		pd.no_of_top_snps_ls = [self.no_of_top_snps]

	for results_id in self.results_id_ls:
		pd.results_id = results_id
		#self.runEnrichmentTestToGetNullData(session, pd)
		return_data = self.runHGTest(pd)
		result = return_data.result_ls[0]
		if result is not None:
			result.type = _type	#assign the type here
			row = []
			for column in result.c.keys():
				row.append(getattr(result, column))
				print '%s: %s' % (column, row[-1])
			if writer:
				writer.writerow(row)
			session.save(result)
			if self.commit:
				session.flush()
def saveDataIntoDB(self, session, genome_wide_result_ls, hist_type, threshold_type, pvalue_matching_data, list_type_id, \
		r1_pvalue_cutoff=3, r2_pvalue_cutoff=5, null_distribution_type_id=1, candidate_gene_set=set(), \
		snps_context_wrapper=None):
	"""
	2009-10-3
		If null_distribution_type_id=2, calculate the permutation pvalue before saving into db.
		If null_distribution_type_id=1, pvalue = None. maybe a 2X2 table test (Fisher test).
	2009-4-16
	"""
	sys.stderr.write("Saving enrichment data into db ...\n")
	results_id1 = genome_wide_result_ls[0].results_id
	results_id2 = genome_wide_result_ls[1].results_id
	pvalue_int_pair_set = set(pvalue_matching_data.pvalue_int_pair2count_non_candidate.keys())
	pvalue_int_pair_set.update(set(pvalue_matching_data.pvalue_int_pair2count_candidate.keys()))
	for pvalue_int_pair in pvalue_int_pair_set:
		if pvalue_int_pair[0] == 0:
			r1_min_score = 0
			r1_max_score = r1_pvalue_cutoff
		else:
			r1_min_score = r1_pvalue_cutoff
			r1_max_score = None
		if pvalue_int_pair[1] == 0:
			r2_min_score = 0
			r2_max_score = r2_pvalue_cutoff
		else:
			r2_min_score = r2_pvalue_cutoff
			r2_max_score = None
		candidate_sample_size = pvalue_matching_data.pvalue_int_pair2count_candidate.get(pvalue_int_pair)
		non_candidate_sample_size = pvalue_matching_data.pvalue_int_pair2count_non_candidate.get(pvalue_int_pair)
		candidate_gw_size = len(pvalue_matching_data.pvalue_ls1_in_candidate)
		non_candidate_gw_size = len(pvalue_matching_data.pvalue_ls1_in_non_candidate)
		if candidate_sample_size is not None and non_candidate_sample_size is not None and \
				candidate_gw_size > 0 and non_candidate_sample_size > 0:
			enrichment_ratio = (candidate_sample_size * non_candidate_gw_size) / \
				float(non_candidate_sample_size * candidate_gw_size)
		else:
			enrichment_ratio = None

		### 2009-10-2
		if null_distribution_type_id == 1:
			pvalue = None	#need to figure out a way to calculate the pvalue, maybe a 2X2 table test (Fisher test)
		elif null_distribution_type_id == 2:
			#without numpy.array around [], an empty [] would be treated as numpy.float64 by default.
			cand_snp_index_ls = pvalue_matching_data.pvalue_int_pair2cand_snp_index_ls.get(pvalue_int_pair, \
				numpy.array([], numpy.int))
			non_cand_snp_index_ls = pvalue_matching_data.pvalue_int_pair2non_cand_snp_index_ls.get(pvalue_int_pair, \
				numpy.array([], numpy.int))
			top_snp_index_ls = numpy.hstack((cand_snp_index_ls, non_cand_snp_index_ls))
			if len(top_snp_index_ls) == 0:
				pvalue = None
			else:
				return_data = self.get_enrichment_pvalue_by_gw_looping(candidate_sample_size, top_snp_index_ls, \
					candidate_gene_set, snps_context_wrapper, \
					pvalue_matching_data.no_of_total_snps, total_chr_pos_ar=pvalue_matching_data.total_chr_pos_ar, \
					no_of_permutations=20000, no_of_min_breaks=30)
				pvalue = return_data.pvalue
				no_of_tests = return_data.no_of_tests
				no_of_tests_passed = return_data.no_of_tests_passed

		entry = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethods(results_id1=results_id1, results_id2=results_id2, \
			list_type_id=list_type_id, \
			type=hist_type, r1_min_score=r1_min_score, r1_max_score=r1_max_score, \
			r2_min_score=r2_min_score, r2_max_score=r2_max_score, \
			candidate_sample_size=candidate_sample_size, non_candidate_sample_size=non_candidate_sample_size, \
			candidate_gw_size=candidate_gw_size, non_candidate_gw_size=non_candidate_gw_size, \
			enrichment_ratio=enrichment_ratio, pvalue=pvalue)
		entry.threshold_type = threshold_type
		session.save(entry)
		#session.flush()
	sys.stderr.write("Done.\n")
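
# --- Hedged sketch (not part of the original module) ---
# The enrichment-ratio arithmetic in saveDataIntoDB(): the candidate fraction within the
# selected sample divided by the candidate fraction genome-wide. All counts are made up.
example_candidate_sample_size = 12	#candidate SNPs passing both score cutoffs
example_non_candidate_sample_size = 88	#non-candidate SNPs passing both cutoffs
example_candidate_gw_size = 1000	#candidate SNPs genome-wide
example_non_candidate_gw_size = 99000	#non-candidate SNPs genome-wide
example_enrichment_ratio = (example_candidate_sample_size * example_non_candidate_gw_size) / \
	float(example_non_candidate_sample_size * example_candidate_gw_size)
print example_enrichment_ratio	#=> 13.5, i.e. candidates are ~13.5x over-represented in the sample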
def findCNVcontext(self, db_250k, genomeRBDict, cnv_method_id=None, compareIns=None, max_distance=50000, debug=0, \
		param_obj=None):
	"""
	2011-3-25
		cast row.chromosome (from db) into str type.
	2010-10-3
		bug fixed: (chr, start, stop) is not unique. There are genes with the same coordinates.
	2010-8-18
	"""
	sys.stderr.write("Finding CNV context ... \n")
	session = db_250k.session
	TableClass = Stock_250kDB.CNV
	query = TableClass.query.filter_by(cnv_method_id=cnv_method_id)
	for row in query:
		#min_reciprocal_overlap doesn't matter here; overlap is decided by compareIns.
		segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=str(row.chromosome), \
			span_ls=[row.start, row.stop], \
			min_reciprocal_overlap=0.0000001)
		node_ls = []
		genomeRBDict.findNodes(segmentKey, node_ls=node_ls, compareIns=compareIns)
		for node in node_ls:
			geneSegKey = node.key
			for oneGeneData in node.value:
				#geneSegKey.span_ls expands 20kb upstream or downstream of the gene.
				overlapData = get_overlap_ratio(segmentKey.span_ls, \
					[oneGeneData.gene_start, oneGeneData.gene_stop])
				overlapFraction1 = overlapData.overlapFraction1
				overlapFraction2 = overlapData.overlapFraction2
				overlap_length = overlapData.overlap_length
				overlap_start_pos = overlapData.overlap_start_pos
				overlap_stop_pos = overlapData.overlap_stop_pos
				if overlap_length > 0:	#use fraction of gene length as coordinates.
					gene_length = oneGeneData.gene_stop - oneGeneData.gene_start + 1
					try:
						if oneGeneData.strand == '+1':
							term5_disp_pos = abs(overlap_start_pos - oneGeneData.gene_start) / float(gene_length)
							term3_disp_pos = abs(overlap_stop_pos - oneGeneData.gene_start + 1) / float(gene_length)
						else:
							term5_disp_pos = abs(oneGeneData.gene_stop - overlap_stop_pos) / float(gene_length)
							term3_disp_pos = abs(oneGeneData.gene_stop - overlap_start_pos + 1) / float(gene_length)
					except:
						import pdb
						pdb.set_trace()
				else:	#no overlap at all
					term3_disp_pos = None
					if oneGeneData.strand == '+1':
						if row.stop <= oneGeneData.gene_start:	#upstream
							term5_disp_pos = row.stop - oneGeneData.gene_start
						elif row.start >= oneGeneData.gene_stop:	#downstream
							term5_disp_pos = row.start - oneGeneData.gene_stop
					else:
						if row.stop <= oneGeneData.gene_start:	#downstream
							term5_disp_pos = oneGeneData.gene_start - row.stop
						elif row.start >= oneGeneData.gene_stop:	#upstream
							term5_disp_pos = oneGeneData.gene_stop - row.start
				cnv_context = Stock_250kDB.CNVContext.query.filter_by(cnv_id=row.id).\
					filter_by(gene_id=oneGeneData.gene_id).first()
				if cnv_context:
					param_obj.no_of_cnv_contexts_already_in_db += 1
				else:
					cnv_context = Stock_250kDB.CNVContext(cnv_id=row.id, gene_id=oneGeneData.gene_id, \
						gene_strand=oneGeneData.strand, term5_disp_pos=term5_disp_pos, \
						term3_disp_pos=term3_disp_pos, \
						overlap_length=overlap_length, \
						overlap_fraction_in_cnv=overlapFraction1, overlap_fraction_in_gene=overlapFraction2)
					session.add(cnv_context)
					param_obj.no_of_into_db += 1
				param_obj.no_of_total_contexts += 1

				for geneCommentaryRBDict in oneGeneData.geneCommentaryRBDictLs:
					gene_box_node_ls = []
					geneCommentaryRBDict.findNodes(segmentKey, node_ls=gene_box_node_ls, compareIns=compareIns)
					for gene_box_node in gene_box_node_ls:
						gene_box_key = gene_box_node.key
						overlapData = get_overlap_ratio(segmentKey.span_ls, gene_box_key.span_ls)
						overlapFraction1 = overlapData.overlapFraction1
						overlapFraction2 = overlapData.overlapFraction2
						overlap_length = overlapData.overlap_length
						overlap_start_pos = overlapData.overlap_start_pos
						overlap_stop_pos = overlapData.overlap_stop_pos
						cnv_annotation = Stock_250kDB.CNVAnnotation.query.filter_by(cnv_id=row.id).\
							filter_by(cnv_context_id=cnv_context.id).\
							filter_by(gene_commentary_id=geneCommentaryRBDict.gene_commentary_id).\
							filter_by(gene_segment_id=gene_box_key.gene_segment_id).first()
						if cnv_annotation:
							param_obj.no_of_cnv_annotations_already_in_db += 1
						else:
							cnv_annotation = Stock_250kDB.CNVAnnotation(cnv_id=row.id, \
								gene_commentary_id=geneCommentaryRBDict.gene_commentary_id, \
								gene_segment_id=gene_box_key.gene_segment_id, label=gene_box_key.label, \
								utr_number=gene_box_key.utr_number, cds_number=gene_box_key.cds_number, \
								intron_number=gene_box_key.intron_number, exon_number=gene_box_key.exon_number, \
								overlap_length=overlap_length, \
								overlap_fraction_in_cnv=overlapFraction1, overlap_fraction_in_gene=overlapFraction2)
							cnv_annotation.cnv_context = cnv_context
							session.add(cnv_annotation)
							param_obj.no_of_into_db += 1
						param_obj.no_of_total_annotations += 1
				if param_obj.no_of_into_db > 2000:
					session.flush()
					param_obj.no_of_into_db = 0
					sys.stderr.write("\t %s/%s CNVContext(s) & %s/%s CNVAnnotation(s) already in db.\n" % (\
						param_obj.no_of_cnv_contexts_already_in_db, param_obj.no_of_total_contexts, \
						param_obj.no_of_cnv_annotations_already_in_db, param_obj.no_of_total_annotations))
	session.flush()
	session.expunge_all()
	sys.stderr.write("\t %s/%s CNVContext(s) & %s/%s CNVAnnotation(s) already in db.\n" % (\
		param_obj.no_of_cnv_contexts_already_in_db, param_obj.no_of_total_contexts, \
		param_obj.no_of_cnv_annotations_already_in_db, param_obj.no_of_total_annotations))
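
# --- Hedged sketch (not part of the original module) ---
# The fractional-coordinate math in findCNVcontext() for a CNV overlapping a plus-strand
# gene: displacements are expressed as fractions of gene length. Positions are made up.
example_gene_start, example_gene_stop = 1000, 2000
example_overlap_start, example_overlap_stop = 1200, 1600
example_gene_length = example_gene_stop - example_gene_start + 1	#1001
example_term5 = abs(example_overlap_start - example_gene_start) / float(example_gene_length)	#~0.200
example_term3 = abs(example_overlap_stop - example_gene_start + 1) / float(example_gene_length)	#~0.600
print example_term5, example_term3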
def submit_StrainxSNP_file2db(self, curs, input_fname, call_info_table, output_dir, method_id, user, \
		chr_pos2db_id=None, **keywords):
	"""
	2011-5-2
		curs is completely useless. keywords must contain the "db" key now.
	2011-2-27
		input file could use either db_id or chr_pos to identify a locus.
	2010-10-13
		add argument chr_pos2db_id and **keywords
		it replaces the old snp ID (chr_pos_...) with the id from table Stock_250kDB.Snps
	2008-1-5
		if the output_fname already exists, exit the program.
		if db insertion fails, delete the file written out and exit the program.
	2008-05-19
		submit the calls from a matrix file (Strain X SNP format, tsv, nucleotides in numbers) to db
	"""
	sys.stderr.write("Submitting %s to db ...\n" % (input_fname))
	db = keywords.get("db")	#2011-5-2
	output_dir = os.path.join(output_dir, 'method_%s' % method_id)
	if not os.path.isdir(output_dir):
		os.makedirs(output_dir)
	#2013.1.18 set priorTAIRVersion=True, locus_type_id=1, because the input is in TAIR8
	db.chr_pos2snp_id = (True, 1)
	reader = csv.reader(openGzipFile(input_fname), delimiter='\t')
	header = reader.next()
	counter = 0
	for row in reader:
		ecotype_id, array_id = row[:2]
		sys.stderr.write("%s\tAssign new call info id to array id=%s ." % ('\x08' * 80, array_id))
		call_info = Stock_250kDB.CallInfo(array_id=array_id, method_id=method_id, created_by=user)
		db.session.add(call_info)
		db.session.flush()
		output_fname = os.path.join(output_dir, '%s_call.tsv' % call_info.id)
		call_info.filename = output_fname
		db.session.add(call_info)
		db.session.flush()
		#new_call_id = self.get_new_call_id(curs, call_info_table, array_id, method_id)
		#if new_call_id!=-1:
		if os.path.isfile(output_fname):
			sys.stderr.write("Error: %s already exists. Check why the file exists while db has no record.\n" % output_fname)
			sys.exit(2)
		writer = csv.writer(openGzipFile(output_fname, 'w'), delimiter='\t')
		writer.writerow(['SNP_ID', array_id])
		for i in range(2, len(row)):
			snp_id = header[i]
			if chr_pos2db_id:	#2010-10-13
				db_id = db.get_db_id_given_chr_pos2db_id(snp_id)
			else:
				db_id = snp_id
			if db_id is not None:
				writer.writerow([db_id, number2nt[int(row[i])]])	#translate the numeric genotype into a nucleotide
		del writer
		"""
		try:
			#curs.execute("insert into %s(id, filename, array_id, method_id, created_by) values (%s, '%s', %s, %s, '%s')"%\
			#	(call_info_table, new_call_id, output_fname, array_id, method_id, user))
		except:
			traceback.print_exc()
			sys.stderr.write('%s.\n'%repr(sys.exc_info()))
			sys.stderr.write('Error encountered while inserting record into %s. Delete the file written.\n'%call_info_table)
			commandline = 'rm %s'%(output_fname)
			command_out = runLocalCommand(commandline)
			sys.exit(3)
		"""
		counter += 1
	del reader
	sys.stderr.write(" %s arrays. Done.\n" % counter)
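
# --- Hedged sketch (not part of the original module) ---
# The translation step above writes number2nt[int(cell)] per SNP: the genotype matrix
# stores calls as integers and number2nt (imported elsewhere in this package) maps them
# back to nucleotide strings. The mapping below is a guess for illustration only, not
# the package's actual table.
example_number2nt = {0: 'NA', 1: 'A', 2: 'C', 3: 'G', 4: 'T'}
example_cell = '3'
print example_number2nt[int(example_cell)]	#=> 'G'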
def run(self):
	"""
	2008-12-08
		if the plot under this configuration is already in db, abort only if the program is going
		to commit the database transaction.
	2008-10-19
		save figures in database if commit
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
		hostname=self.hostname, database=self.dbname, schema=self.schema)
	db.setup(create_tables=False)
	session = db.session
	#session.begin()

	if self.results_type == 1:
		ResultsClass = Stock_250kDB.ResultsMethod
		snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, \
			self.get_closest)
	elif self.results_type == 2:
		ResultsClass = Stock_250kDB.ResultsByGene
	else:
		sys.stderr.write("Invalid results type : %s.\n" % self.results_type)
		return None

	hist_type = self.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
		self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)

	candidate_gene_list = self.getGeneList(self.list_type_id)
	if len(candidate_gene_list) < self.min_sample_size:
		sys.stderr.write("Candidate gene list of %s too small: %s.\n" % (self.list_type_id, len(candidate_gene_list)))
		sys.exit(4)
	#candidate_gene_list = []	#2009-01-12 just to plot the histogram of pvalue
	candidate_gene_set = Set(candidate_gene_list)

	list_type = Stock_250kDB.GeneListType.get(self.list_type_id)
	if list_type is None:
		sys.exit(3)

	phenotype_id2results_id_ls = self.getResultsIDLs(db, ResultsClass, self.results_type, self.phenotype_id_ls, \
		self.min_distance, self.get_closest, self.min_MAF, self.call_method_id)

	#need_the_value means to get the pvalue/score. force no log10 transformation;
	#otherwise, transformation depends on analysis_method.
	param_data = PassingData(results_directory=self.data_dir, candidate_gene_list=candidate_gene_list, \
		min_MAF=self.min_MAF, allow_two_sample_overlapping=self.allow_two_sample_overlapping, need_the_value=1, \
		do_log10_transformation=False, data_dir=self.data_dir)

	if self.null_distribution_type_id == 2 or self.null_distribution_type_id == 3:	#gw-looping or random gene list
		snp_info = DrawSNPRegion.getSNPInfo(db)
		chr_pos_ls = [(row.chromosome, row.position) for row in snp_info.data_ls]
		candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, chr_pos_ls, \
			snps_context_wrapper)
		no_of_snps = len(snp_info.data_ls)
		no_of_permutations = no_of_snps / len(candidate_gene_snp_index_ls) + 1
		param_data.chr_pos2index = snp_info.chr_pos2index	#pass to getGenomeWideResultFromFile
		if self.null_distribution_type_id == 2:
			non_candidate_gene_snp_index_ls = self.get_non_candidate_gene_snp_index_ls_by_permutation(\
				candidate_gene_snp_index_ls, no_of_snps, no_of_permutations)
		elif self.null_distribution_type_id == 3:
			gene_id_ls = get_total_gene_ls(db.metadata.bind)
			no_of_candidate_genes = len(candidate_gene_set)
			non_candidate_gene_snp_index_ls = numpy.zeros(0, numpy.int)
			while len(non_candidate_gene_snp_index_ls) < no_of_snps:
				non_candidate_gene_set = Set(random.sample(gene_id_ls, no_of_candidate_genes))
				_non_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(non_candidate_gene_set, \
					chr_pos_ls, snps_context_wrapper)
				non_candidate_gene_snp_index_ls = numpy.hstack((non_candidate_gene_snp_index_ls, \
					_non_candidate_gene_snp_index_ls))

	for phenotype_id, results_id_ls in phenotype_id2results_id_ls.iteritems():
		if hist_type.id:	#hist_type already in database
			rows = Stock_250kDB.ScoreRankHistogram.query.filter_by(phenotype_method_id=phenotype_id).\
				filter_by(list_type_id=self.list_type_id).filter_by(hist_type_id=hist_type.id)
			if rows.count() > 0 and self.commit:	#2008-12-08 only skip if the database transaction is going to commit.
				row = rows.first()
				sys.stderr.write("Histogram already in database. id=%s, phenotype_id=%s, list_type_id=%s, hist_type_id=%s.\n" % \
					(row.id, row.phenotype_method_id, row.list_type_id, row.hist_type_id))
				continue
		phenotype_method = Stock_250kDB.PhenotypeMethod.get(phenotype_id)
		if not phenotype_method:
			continue
		score_rank_data_ls = []
		sys.stderr.write("Checking phenotype %s (%s) on list_type %s (%s) ...\n" % \
			(phenotype_method.id, phenotype_method.short_name, list_type.id, list_type.short_name))
		for results_id in results_id_ls:
			try:
				rm = ResultsClass.get(results_id)
				score_rank_data = None
				if self.null_distribution_type_id == 1:
					if self.results_type == 1:
						permData = self.prepareDataForPermutationRankTest(rm, snps_context_wrapper, param_data)
						if not permData:
							continue
						score_rank_data = PassingData(candidate_score_ls=permData.candidate_gene_snp_value_ls, \
							candidate_rank_ls=permData.candidate_gene_snp_rank_ls, \
							non_candidate_score_ls=permData.non_candidate_gene_snp_value_ls, \
							non_candidate_rank_ls=permData.non_candidate_gene_snp_rank_ls, \
							analysis_method=rm.analysis_method)
						del permData
					elif self.results_type == 2:
						score_rank_data = self.getScoreRankFromRBG(rm, candidate_gene_set, self.data_dir)
				elif self.null_distribution_type_id == 2 or self.null_distribution_type_id == 3:
					genome_wide_result = db.getResultMethodContent(rm.id, param_data.results_directory, \
						param_data.min_MAF, pdata=param_data)
					if not genome_wide_result:
						continue
					score_rank_data = self.getScoreRankFromPermIndexLs(genome_wide_result, \
						candidate_gene_snp_index_ls, non_candidate_gene_snp_index_ls)
					if score_rank_data:
						score_rank_data.analysis_method = rm.analysis_method
				if score_rank_data:
					score_rank_data_ls.append(score_rank_data)
			except:
				sys.stderr.write("Exception happened for results_id=%s, phenotype_id=%s.\n" % (results_id, phenotype_id))
				traceback.print_exc()
				sys.stderr.write('%s.\n' % repr(sys.exc_info()))
				continue
		if score_rank_data_ls:
			score_png_data, score_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, \
				score_rank_data_ls, self.output_dir, data_type='score', commit=self.commit)
			rank_png_data, rank_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, \
				score_rank_data_ls, self.output_dir, data_type='rank', commit=self.commit)
			if self.commit:
				score_rank_hist = Stock_250kDB.ScoreRankHistogram(phenotype_method_id=phenotype_id, \
					list_type_id=list_type.id)
				score_rank_hist.hist_type = hist_type
				score_rank_hist.score_hist = score_png_data.getvalue()
				score_rank_hist.score_hist_svg = score_svg_data.getvalue()
				score_rank_hist.rank_hist = rank_png_data.getvalue()
				score_rank_hist.rank_hist_svg = rank_svg_data.getvalue()
				session.save(score_rank_hist)
				session.flush()
			del score_png_data, score_svg_data, rank_png_data, rank_svg_data
def run(self):
	"""
	2008-04-25
		return None if QC_method_id==0
	2008-04-20
		for plone to call it just to get row_id2NA_mismatch_rate
	"""
	#database connection and etc
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user, password=self.passwd, \
		hostname=self.hostname, database=self.dbname)
	db.setup(create_tables=False)
	session = db.session
	session.begin()
	#transaction = session.create_transaction()

	self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
	qm = self.QCMethod_class.query.get(self.QC_method_id)	#2009-5-20

	import MySQLdb
	conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
	curs = conn.cursor()
	self.curs = curs
	if self.debug:
		import pdb
		pdb.set_trace()

	readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
	session.add(readme)

	QC_method_id2snps_table = self.QC_method_id2snps_table
	if self.QC_method_id == 0:
		self.cal_independent_NA_rate(db, self.min_probability, readme)
		row_id2NA_mismatch_rate = None
	else:
		#from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
		header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename, \
			ignore_het=qm.ignore_het)
		#strain_acc_list holds ecotype ids; cast to integer to be compatible with the ecotype_id_ls from db later.
		strain_acc_list = map(int, strain_acc_list)
		#category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
		snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
			data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id), \
			ignore_het=qm.ignore_het)

		"""
		if self.input_dir and os.path.isdir(self.input_dir):
			#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
			#no submission to db
			call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
		"""
		if self.input_dir and os.path.isfile(self.input_dir):	#it's a file
			call_info_id2fname = None
		else:
			if self.run_type == 2:	#no filtering on call_info entries that have been QCed.
				filter_calls_QCed = 0
			elif self.run_type == 1:
				filter_calls_QCed = 1
				self.max_call_info_mismatch_rate = 1	#don't use this when doing accession-wise QC
			else:
				sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
				sys.exit(5)
			call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
				filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug, \
				min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
			call_info_id2fname = call_data.call_info_id2fname
			call_info_ls_to_return = call_data.call_info_ls_to_return

		if self.run_type == 2:
			snps_name2snps_id = self.get_snps_name2snps_id(db)
		else:
			snps_name2snps_id = None

		if call_info_id2fname:
			db_id2chr_pos = db.getSNPID2ChrPos()	#2011-22
			from DB_250k2data import DB_250k2Data
			db_id2index = DB_250k2Data.getSNPID2index(call_info_id2fname.values()[0][1], db_id2chr_pos)
			if self.one_by_one and self.run_type == 1:	#one_by_one only for QC by accession
				row_id2NA_mismatch_rate = {}
				row_id12row_id2 = {}
				counter = 0
				for call_info_id, value in call_info_id2fname.iteritems():
					counter += 1
					print "No", counter
					tmp_dict = {}
					tmp_dict[call_info_id] = value
					pdata = self.read_call_matrix(tmp_dict, self.min_probability, \
						db_id2chr_pos=db_id2chr_pos, db_id2index=db_id2index)
					#05/20/09 no need for qm.ignore_het because 250k is all homozygous
					passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
					row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
					row_id12row_id2.update(passingdata.row_id12row_id2)
					del pdata
					if self.debug and counter == 10:
						break
			else:
				pdata = self.read_call_matrix(call_info_id2fname, self.min_probability, \
					db_id2chr_pos=db_id2chr_pos, db_id2index=db_id2index)
				#05/20/09 no need for qm.ignore_het because 250k is all homozygous
				passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
				row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
				row_id12row_id2 = passingdata.row_id12row_id2
				del pdata
		else:	#input file is in SNP-by-strain format with a double header (1st two lines).
			header, snps_name_ls, category_list, data_matrix = read_data(self.input_dir, double_header=1, \
				ignore_het=qm.ignore_het)
			pdata = PassingData()
			pdata.ecotype_id_ls = header[0][2:]
			pdata.call_info_id_ls = header[1][2:]
			data_matrix = numpy.array(data_matrix)
			pdata.data_matrix = data_matrix.transpose()
			pdata.header = ['', ''] + snps_name_ls	#fake a header for SNPData
			passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
			row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
			row_id12row_id2 = passingdata.row_id12row_id2
			del pdata

	if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
		self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

	if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
		#if self.input_dir is given, no db submission. call_info_id2fname here is fake; it's actually
		#keyed by (array_id, ecotypeid). row_id2NA_mismatch_rate might be None if it's method 0.
		self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, \
			self.min_probability, row_id12row_id2, self.call_method_id, readme)
	if self.commit:
		curs.execute("commit")
		session.commit()
	else:
		session.rollback()

	self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate	#for plone to get the data structure
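
# --- Hedged sketch (not part of the original module) ---
# The SNP-by-strain "double header" branch above transposes the matrix so rows become
# arrays/accessions, then fakes a one-line header for SNPData. A toy version with
# made-up values, inferred from the slicing code above:
import numpy
example_header = [['', '', '6909', '6910'], ['', '', '101', '102']]	#ecotype ids, then call_info ids
example_snps_name_ls = ['1_657', '1_3102']
example_data_matrix = numpy.array([[1, 2], [3, 4]])	#SNP x strain
example_transposed = example_data_matrix.transpose()	#strain x SNP, as SNPData expects
example_fake_header = ['', ''] + example_snps_name_ls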