def run(self):
	if self.debug:
		import pdb
		pdb.set_trace()
	
	counter = 0
	no_of_vcf = 0
	real_counter = 0
	for inputFname in self.inputFnameLs:
		counter += 1
		if os.path.isfile(inputFname):
			try:
				if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False):
					no_of_vcf += 1
					if NextGenSeq.isVCFFileEmpty(inputFname, checkContent=self.checkEmptyVCFByReading):
						if self.commit:
							if self.report:
								sys.stderr.write("file %s deleted.\n"%(inputFname))
							commandline = 'rm %s'%(inputFname)
							return_data = runLocalCommand(commandline, report_stderr=True, report_stdout=True)
						real_counter += 1
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
		if self.report and counter%500==0:
			sys.stderr.write("%s%s\t%s\t%s"%('\x08'*80, counter, no_of_vcf, real_counter))
	sys.stderr.write("%s%s\t%s\t%s\n"%('\x08'*80, counter, no_of_vcf, real_counter))
	sys.stderr.write("%s files in total.\n"%(counter))
	sys.stderr.write("Out of %s VCF files, %s are empty and were deleted.\n"%(no_of_vcf, real_counter))
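# Hedged illustration (not the NextGenSeq implementation): a minimal standalone sketch of what a
# content-based "is this VCF empty" check could look like, assuming an empty VCF is one that is
# missing, zero-length, or contains only meta/header lines starting with '#'. The helper name
# _isVCFFileEmptySketch and its exact semantics are assumptions for illustration only.
import gzip
import os

def _isVCFFileEmptySketch(inputFname, checkContent=False):
	if not os.path.isfile(inputFname):
		return True
	if os.path.getsize(inputFname) == 0:
		return True
	if not checkContent:	#cheap check only: a non-empty file passes
		return False
	if inputFname.endswith('.gz'):
		f = gzip.open(inputFname, 'rb')
	else:
		f = open(inputFname, 'rb')
	try:
		for line in f:
			if isinstance(line, bytes):
				line = line.decode('utf-8', 'replace')
			if line.strip() and not line.startswith('#'):	#first data line found => not empty
				return False
	finally:
		f.close()
	return True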
def run(self): """ 2011-7-11 """ if self.debug: import pdb pdb.set_trace() if NextGenSeq.isVCFFileEmpty(self.inputFname, checkContent=True): sys.stderr.write("Input %s doesn't exist or no variants in it.\n"%(self.inputFname)) #make sure some output files will exist for downstream jobs. self.openOutputFiles(self.outputFnamePrefix, self.windowSize) sys.exit(0) vcfFile = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth) trio_col_index_data = self.findTrioIndex(vcfFile.sample_id2index, self.trio_id) father_index = trio_col_index_data.father_index mother_index = trio_col_index_data.mother_index child_index = trio_col_index_data.child_index outputDStruc = self.openOutputFiles(self.outputFnamePrefix, self.windowSize) if (father_index==-1 and mother_index!=-1) or (father_index!=-1 and mother_index==-1): #one parent is missing. it's duo. self._calculateForDuo(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data) else: self._calculateForTrio(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data) """
def add2DB(self, db=None, individual_alignment_id=None, inputFname=None, format=None, minDP=None, maxDP=None, minBaseQ=None, minMapQ=None,\
		minRMSMapQ=None, minDistanceToIndel=None, comment=None, data_dir=None, commit=0):
	"""
	2012.11.13
	"""
	session = db.session
	session.begin()
	
	#2012.11.13 check if it's in db already
	db_entry = db.checkIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, minDP=minDP, \
					maxDP=maxDP, minBaseQ=minBaseQ, minMapQ=minMapQ,\
					minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel)
	if db_entry:
		sys.stderr.write("Warning: IndividualAlignmentConsensusSequence of (individual_alignment_id=%s, minDP %s, maxDP %s, etc.) already in db with id=%s.\n"%\
						(individual_alignment_id, minDP, maxDP, db_entry.id))
		sys.exit(3)
	else:
		countData = NextGenSeq.countNoOfChromosomesBasesInFastQFile(inputFname)
		no_of_chromosomes = countData.no_of_chromosomes
		no_of_bases = countData.no_of_bases
		db_entry = db.getIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, format=format, \
							minDP=minDP, maxDP=maxDP, minBaseQ=minBaseQ, \
							minMapQ=minMapQ, minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel, \
							no_of_chromosomes=no_of_chromosomes, no_of_bases=no_of_bases, \
							original_path=os.path.abspath(inputFname), data_dir=data_dir)
	
	if commit:
		inputFileBasename = os.path.basename(inputFname)
		#moveFileIntoDBAffiliatedStorage() will also set db_entry.path
		exitCode = db.moveFileIntoDBAffiliatedStorage(db_entry=db_entry, filename=inputFileBasename, \
						inputDir=os.path.split(inputFname)[0], \
						outputDir=data_dir,\
						relativeOutputDir=None, shellCommand='cp -rL', \
						srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
						constructRelativePathFunction=db_entry.constructRelativePath, data_dir=data_dir)
		if exitCode!=0:
			sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code %s.\n"%(exitCode))
			session.rollback()
			self.cleanUpAndExitOnFailure(exitCode=exitCode)
		session.flush()
		session.commit()
	else:
		#nothing to persist; roll back explicitly rather than relying on the implicit default
		session.rollback()
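# Hedged illustration (names are hypothetical, not the NextGenSeq API): a minimal sketch of the kind of
# chromosome/base counting that add2DB() relies on, assuming the consensus file is FASTQ-style with one
# 4-line record (@name, sequence, '+', quality) per chromosome. It only shows where numbers like
# no_of_chromosomes and no_of_bases could come from before they are stored with the db entry.
import gzip

def _countFastQSketch(inputFname):
	no_of_chromosomes = 0
	no_of_bases = 0
	if inputFname.endswith('.gz'):
		f = gzip.open(inputFname, 'rb')
	else:
		f = open(inputFname, 'rb')
	try:
		while True:
			header = f.readline()
			if not header:	#end of file
				break
			sequence = f.readline()
			f.readline()	#the '+' separator line
			f.readline()	#the quality line
			no_of_chromosomes += 1
			no_of_bases += len(sequence.strip())
	finally:
		f.close()
	return no_of_chromosomes, no_of_bases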
def run(self): """ """ if self.debug: import pdb pdb.set_trace() db_vervet = self.db_vervet if not self.data_dir: self.data_dir = db_vervet.data_dir if not self.local_data_dir: self.local_data_dir = db_vervet.data_dir # Create a abstract dag workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerJars(workflow) self.registerCommonExecutables(workflow) self.registerCustomExecutables(workflow) refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id) refFastaFname = os.path.join(self.data_dir, refSequence.path) registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname, registerAffiliateFiles=True, \ input_site_handler=self.input_site_handler,\ checkAffiliateFileExistence=True) refFastaFList = registerReferenceData.refFastaFList self.outputAlignmentDepthAndOthersForFilter(self.alnStatForFilterFname, ref_ind_seq_id=self.ref_ind_seq_id, \ foldChange=self.depthFoldChange, minGQ=self.minGQ) alnStatForFilterF = self.registerOneInputFile(workflow, self.alnStatForFilterFname) #name to distinguish between vcf1Dir, and vcf2Dir vcf1Name = self.findProperVCFDirIdentifier(self.vcf1Dir, defaultName='vcf1') vcf2Name = self.findProperVCFDirIdentifier(self.vcf2Dir, defaultName='vcf2') if vcf2Name==vcf1Name or not vcf2Name: vcf2Name = "vcf2" no_of_jobs = 0 vcf1DepthFilterDir = "%s_DepthFilter"%(vcf1Name) vcf1DepthFilterDirJob = self.addMkDirJob(outputDir=vcf1DepthFilterDir) #vcf2DepthFilterDir = "%s_DepthFilter"%(vcf2Name) #vcf2DepthFilterDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=vcf2DepthFilterDir) trioInconsistencyDir = "trioInconsistency" trioInconsistencyDirJob = self.addMkDirJob(outputDir=trioInconsistencyDir) SNPMismatchStatDir = "SNPMismatchStat" SNPMismatchStatDirJob = self.addMkDirJob(outputDir=SNPMismatchStatDir) input_site_handler = self.input_site_handler #whole genome reduction job. 
wholeGenomeSiteStatFile = File('siteStatAndTrioInconsistency.tsv') wholeGenomeSiteStatMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \ outputF=wholeGenomeSiteStatFile,transferOutput=False) wholeGenomeSiteStatBGzipFile = File("%s.gz"%wholeGenomeSiteStatFile.name) wholeGenomeSiteStatBGZipTabixJob = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=workflow.bgzip_tabix, \ parentJob=wholeGenomeSiteStatMergeJob, inputF=wholeGenomeSiteStatFile, \ outputF=wholeGenomeSiteStatBGzipFile, \ transferOutput=True, tabixArguments="-s 1 -b 2 -e 2") no_of_jobs += 5 #read the trioInconsistencyByPosistionFname and figure out how many contigs in it and add an extraction job for each contig chrLs = self.getChrListInTrioInconsistencyFile(self.tabixPath, self.trioInconsistencyByPosistionFname) chr2tabixRetrieveJob = {} trioInconsistencyByPosistionF = self.registerOneInputFile(workflow, self.trioInconsistencyByPosistionFname) trioInconsistencyByPosistion_tbi_Fname = '%s.tbi'%(self.trioInconsistencyByPosistionFname) trioInconsistencyByPosistion_tbi_F = self.registerOneInputFile(workflow, trioInconsistencyByPosistion_tbi_Fname) for chr in chrLs: outputF = File(os.path.join(trioInconsistencyDir, '%s.trioInconsistency.tsv'%chr)) tabixRetrieveJob = self.addTabixRetrieveJob(workflow, executable=workflow.tabixRetrieve, tabixPath=self.tabixPath, \ inputF=trioInconsistencyByPosistionF, outputF=outputF, regionOfInterest=chr, includeHeader=True,\ parentJobLs=[trioInconsistencyDirJob], job_max_memory=100, extraDependentInputLs=[trioInconsistencyByPosistion_tbi_F], \ transferOutput=False) chr2tabixRetrieveJob[chr] = tabixRetrieveJob no_of_jobs += 1 counter = 0 no_of_vcf = 0 no_of_good_vcf = 0 for inputFname in os.listdir(self.vcf1Dir): counter += 1 if counter%500==0: sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files"%('\x08'*180, no_of_jobs, \ no_of_good_vcf, no_of_vcf, counter)) vcf1AbsPath = os.path.join(os.path.abspath(self.vcf1Dir), inputFname) vcf2AbsPath = os.path.join(os.path.abspath(self.vcf2Dir), inputFname) if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False) and not NextGenSeq.isVCFFileEmpty(vcf1AbsPath): if not NextGenSeq.isVCFFileEmpty(vcf2AbsPath, checkContent=self.checkEmptyVCFByReading): #make sure the samtools vcf exists no_of_vcf += 1 chr = self.getChrFromFname(inputFname) if not chr or chr not in chr2tabixRetrieveJob: continue no_of_good_vcf += 1 #find the contig id and the matching tabix job commonPrefix = inputFname.split('.')[0] vcf1 = File(os.path.join(vcf1Name, inputFname)) #relative path vcf1.absPath = vcf1AbsPath self.registerVCFAndItsTabixIndex(workflow, vcf1, input_site_handler) vcf2 = File(os.path.join(vcf2Name, inputFname)) #relative path vcf2.absPath = vcf2AbsPath self.registerVCFAndItsTabixIndex(workflow, vcf2, input_site_handler) outputSiteStatF = File(os.path.join(vcf1DepthFilterDir, '%s.siteStat.tsv'%(commonPrefix))) vcf1FilterByDepthJob = self.addFilterVCFByDepthJob(workflow, FilterVCFByDepthJava=workflow.FilterVCFByDepthJava, \ GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \ refFastaFList=refFastaFList, inputVCFF=vcf1, outputVCFF=None, outputSiteStatF=outputSiteStatF,\ parentJobLs=[vcf1DepthFilterDirJob], \ alnStatForFilterF=alnStatForFilterF, \ extraDependentInputLs=[vcf1.tbi_F], onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP) snpMisMatchStatFile = File(os.path.join(SNPMismatchStatDir, '%s_snpMismatchStat.tsv'%(os.path.splitext(commonPrefix)[0]))) calculateSNPMismatchRateOfTwoVCFJob = 
self.addCalculateTwoVCFSNPMismatchRateJob(workflow, \ executable=workflow.CalculateSNPMismatchRateOfTwoVCF, \ vcf1=vcf1, vcf2=vcf2, snpMisMatchStatFile=snpMisMatchStatFile, \ maxSNPMismatchRate=1.0, parentJobLs=[SNPMismatchStatDirJob], \ job_max_memory=1000, extraDependentInputLs=[], \ transferOutput=False) #add a ReduceMatrixByMergeColumnsWithSameKey job chrMergingStatF = File('%s_variantSiteStatAndTrioInconsistencyRate.tsv'%(chr)) chrMergingStatJob = self.addStatMergeJob(workflow, \ statMergeProgram=workflow.ReduceMatrixByMergeColumnsWithSameKey, \ outputF=chrMergingStatF, extraArguments='-k 0,1', transferOutput=False) tabixRetrieveJob = chr2tabixRetrieveJob[chr] self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=tabixRetrieveJob.output, \ parentJobLs=[tabixRetrieveJob]) self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=outputSiteStatF, \ parentJobLs=[vcf1FilterByDepthJob]) self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=snpMisMatchStatFile, \ parentJobLs=[calculateSNPMismatchRateOfTwoVCFJob]) #add to the whole genome reduction job self.addInputToStatMergeJob(workflow, statMergeJob=wholeGenomeSiteStatMergeJob, \ inputF=chrMergingStatJob.output, \ parentJobLs=[chrMergingStatJob]) no_of_jobs += 3 sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files.\n"%('\x08'*180, no_of_jobs, \ no_of_good_vcf, no_of_vcf, counter)) # Write the DAX to stdout outf = open(self.outputFname, 'w') workflow.writeXML(outf)
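# Hedged illustration of the shell-level equivalent of the addBGZIP_tabix_Job step above: compress the
# merged per-site table with bgzip and index it with tabix using the same "-s 1 -b 2 -e 2" column layout
# (chromosome in column 1, position in column 2). This is only a subprocess sketch of what that job runs;
# in the actual workflow it is executed as a Pegasus job, and the executable paths here are assumptions.
import subprocess

def _bgzipAndTabixSketch(inputFname, bgzipPath='bgzip', tabixPath='tabix'):
	gzFname = '%s.gz'%(inputFname)
	outf = open(gzFname, 'wb')
	try:
		subprocess.check_call([bgzipPath, '-c', inputFname], stdout=outf)	#bgzip to stdout, captured into .gz
	finally:
		outf.close()
	subprocess.check_call([tabixPath, '-s', '1', '-b', '2', '-e', '2', gzFname])	#creates gzFname + '.tbi'
	return gzFname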
def run(self): """ 2012.7.13 """ if self.debug: import pdb pdb.set_trace() session = self.db_vervet.session session.begin() if not self.data_dir: self.data_dir = self.db_vervet.data_dir data_dir = self.data_dir realPath = os.path.realpath(self.inputFname) logMessage = "file %s.\n"%(self.inputFname) if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \ not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading): vcfFile = VCFFile(inputFname=self.inputFname) individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile) genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \ individualAlignmentLs=individualAlignmentLs,\ no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\ data_dir=self.data_dir) self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session) pdata = self.getNoOfLociFromVCFFile(vcfFile) chromosome2noOfLoci = pdata.chromosome2noOfLoci no_of_loci = pdata.no_of_loci if no_of_loci>0: #file with zero loci could have identical md5sum try: md5sum = utils.get_md5sum(realPath) except: sys.stderr.write('Except type: %s\n'%repr(sys.exc_info())) import traceback traceback.print_exc() self.cleanUpAndExitOnFailure(exitCode=4) else: md5sum = None """ db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first() if db_entry: sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\ (db_entry.path, md5sum, realPath)) session.rollback() #2012.8.3 when the jobs are clustered into one merged job and it failed halfway # and retried elsewhere, the redundancy check should not exit with non-zero. otherwise the merged job would fail again. self.cleanUpAndExitOnFailure(exitCode=0) """ no_of_individuals = len(individualAlignmentLs) no_of_chromosomes = len(chromosome2noOfLoci) if no_of_chromosomes == 1: #2012.8.30 use 1st chromosome chromosome = chromosome2noOfLoci.keys()[0] else: chromosome = None genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\ chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\ original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\ data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes) if genotypeFile.id and genotypeFile.path: isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir) if isPathInDB==-1: sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path)) self.cleanUpAndExitOnFailure(exitCode=isPathInDB) elif isPathInDB==1: #successful exit, entry already in db sys.stderr.write("Warning: file %s is already in db.\n"%\ (genotypeFile.path)) session.rollback() self.cleanUpAndExitOnFailure(exitCode=0) else: #not in db affiliated storage, keep going. 
pass #move the file and update the db_entry's path as well inputFileBasename = os.path.basename(self.inputFname) relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename) exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \ inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \ relativeOutputDir=None, shellCommand='cp -rL', \ srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\ constructRelativePathFunction=genotypeFile.constructRelativePath) if exitCode!=0: sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode)) session.rollback() self.cleanUpAndExitOnFailure(exitCode=exitCode) #copy the tbi (tabix) index file if it exists tbiFilename = '%s.tbi'%(realPath) if os.path.isfile(tbiFilename): srcFilename = tbiFilename dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path)) utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename) logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename) ## 2012.7.17 commented out because md5sum is calcualted above #db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir) # #2012.7.17 record the size of db_entry.path (folder or file) self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir) vcfFile.close() logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum) else: logMessage += " is empty (no loci) or not VCF file.\n" self.outputLogMessage(logMessage) if self.commit: try: session.flush() session.commit() except: sys.stderr.write('Except type: %s\n'%repr(sys.exc_info())) import traceback traceback.print_exc() self.cleanUpAndExitOnFailure(exitCode=3) else: session.rollback() #delete all target files but exit gracefully (exit 0) self.cleanUpAndExitOnFailure(exitCode=0)
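# Hedged illustration (not the actual utils.get_md5sum): a minimal streaming md5 sketch showing the kind
# of checksum run() stores for non-empty genotype files, reading in chunks so large VCFs never have to
# fit in memory. The helper name and chunk size are assumptions for illustration only.
import hashlib

def _get_md5sumSketch(inputFname, chunkSize=16*1024*1024):
	m = hashlib.md5()
	f = open(inputFname, 'rb')
	try:
		chunk = f.read(chunkSize)
		while chunk:
			m.update(chunk)
			chunk = f.read(chunkSize)
	finally:
		f.close()
	return m.hexdigest()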