def getReferenceSequence(self, workflow=None, **keywords):
    """
    2013.3.20 yh_pegasus.registerRefFastaFile() returns a PassingData
    2013.1.25
    """
    sys.stderr.write("Getting reference sequences ...")
    if workflow is None:
        workflow = self
    refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
    refFastaFname = os.path.join(self.data_dir, refSequence.path)
    registerReferenceData = yh_pegasus.registerRefFastaFile(workflow=workflow, refFastaFname=refFastaFname,
        registerAffiliateFiles=True, input_site_handler=self.input_site_handler,
        checkAffiliateFileExistence=True)
    sys.stderr.write(" %s files.\n" % (len(registerReferenceData.refFastaFList)))
    return registerReferenceData
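# Illustrative sketch (not from this codebase): roughly what
# checkAffiliateFileExistence=True implies for the reference FASTA. The
# suffix list below is an assumption based on the usual samtools/BWA/Picard
# convention; yh_pegasus.registerRefFastaFile() may check a different set.
import os

def listMissingAffiliateFiles(refFastaFname, suffixes=('.fai', '.amb', '.ann', '.bwt', '.pac', '.sa')):
    """Return the affiliate files of a reference FASTA that are absent on disk."""
    missing = [refFastaFname + suffix for suffix in suffixes
        if not os.path.isfile(refFastaFname + suffix)]
    # Picard's sequence dictionary replaces the FASTA suffix rather than appending to it
    dictFname = os.path.splitext(refFastaFname)[0] + '.dict'
    if not os.path.isfile(dictFname):
        missing.append(dictFname)
    return missing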
def run(self):
    """
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = self.db_vervet
    if not self.data_dir:
        self.data_dir = db_vervet.data_dir
    if not self.local_data_dir:
        self.local_data_dir = db_vervet.data_dir
    """
    #unless the db_vervet connection code is commented out, schema "genome" won't be the default path.
    db_genome = GenomeDB.GenomeDatabase(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema="genome")
    db_genome.setup(create_tables=False)
    chr2size = db_genome.getTopNumberOfChomosomes(contigMaxRankBySize=80000, contigMinRankBySize=1,
        tax_id=60711, sequence_type_id=9)
    """
    workflow = self.initiateWorkflow()
    self.registerJars(workflow)
    self.registerCommonExecutables(workflow)
    self.registerCustomExecutables(workflow)

    refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
    refFastaFname = os.path.join(self.data_dir, refSequence.path)
    registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname,
        registerAffiliateFiles=True, input_site_handler=self.input_site_handler,
        checkAffiliateFileExistence=True)

    if self.depthFoldChange and self.depthFoldChange > 0:
        self.outputAlignmentDepthAndOthersForFilter(db_vervet=db_vervet,
            outputFname=self.alnStatForFilterFname,
            ref_ind_seq_id=self.ref_ind_seq_id,
            foldChange=self.depthFoldChange, minGQ=self.minGQ)
        alnStatForFilterF = self.registerOneInputFile(inputFname=os.path.abspath(self.alnStatForFilterFname))
    else:
        alnStatForFilterF = None

    if self.vcf1Dir:
        # 2012.5.1 filter only on the 1st vcf folder
        # a relative-path name for vcf1Dir
        vcf1Name = self.findProperVCFDirIdentifier(self.vcf1Dir, defaultName='vcf1')
        inputData = self.registerAllInputFiles(workflow, self.vcf1Dir,
            input_site_handler=self.input_site_handler,
            checkEmptyVCFByReading=self.checkEmptyVCFByReading,
            pegasusFolderName="%s_%s" % (self.pegasusFolderName, vcf1Name),
            maxContigID=self.maxContigID,
            minContigID=self.minContigID)
        vcf2PlinkJobData = self.addVCF2PlinkJobs(workflow, inputData=inputData, db_vervet=db_vervet,
            minMAC=None, minMAF=None, maxSNPMissingRate=None, transferOutput=False,
            maxContigID=self.maxContigID, outputDirPrefix="vcf2plink",
            outputPedigreeAsTFAM=True, treatEveryOneIndependent=False,
            returnMode=2, ModifyTPEDRunType=1, chr_id2cumu_chr_start=None)
        mendelJobData = self.addPlinkMendelErrorJobs(inputData=vcf2PlinkJobData, transferOutput=True,
            maxContigID=self.maxContigID, outputDirPrefix="mendel",
            locusSamplingRate=0.1, returnMode=2)
        # the last job from the Plink Mendel-error workflow is the locus-mendel merge job
        locusMendelJobData = mendelJobData.jobDataLs[-1]

        keepSNPPosF = File('sitesWithMax%sMendelError.tsv' % (self.maxMendelError))
        outputSitesBelowMaxMendelJob = self.addGenericJob(executable=self.OutputSitesBelowMaxMendelError,
            inputFile=locusMendelJobData.file, inputArgumentOption="-i",
            outputFile=keepSNPPosF, outputArgumentOption="-o",
            parentJobLs=locusMendelJobData.jobLs, extraDependentInputLs=None,
            extraOutputLs=None, transferOutput=True,
            extraArguments="-m %s" % (self.maxMendelError), extraArgumentList=None,
            job_max_memory=2000, sshDBTunnel=None, key2ObjectForJob=None)
        self.addJobsToFilterOneVCFDir(workflow, inputData=inputData,
            registerReferenceData=registerReferenceData,
            alnStatForFilterF=alnStatForFilterF, keepSNPPosF=keepSNPPosF,
            onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP,
            minMAC=self.minMAC, minMAF=self.minMAF, maxSNPMissingRate=self.maxSNPMissingRate,
            minDepthPerGenotype=self.minDepthPerGenotype, outputDirPrefix="filter",
            minNeighborDistance=self.minNeighborDistance,
            keepSNPPosParentJobLs=[outputSitesBelowMaxMendelJob])

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
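# Hypothetical sketch of the per-site filter that the OutputSitesBelowMaxMendelError
# executable performs: keep loci whose Mendel-error count is at most a cutoff.
# The column names below are assumptions; the real program's input format may differ.
import csv

def outputSitesBelowMaxMendelError(inputFname, outputFname, maxMendelError=2,
        chrCol='Chromosome', posCol='Start', errorCol='noOfMendelErrors'):
    """Write (chromosome, position) rows for loci with <= maxMendelError Mendel errors."""
    with open(inputFname) as inf, open(outputFname, 'w') as outf:
        reader = csv.DictReader(inf, delimiter='\t')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow([chrCol, posCol])
        for row in reader:
            if int(row[errorCol]) <= maxMendelError:
                writer.writerow([row[chrCol], row[posCol]])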
def run(self):
    """
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # This has to come before the db_vervet connection code; otherwise schema
    # "genome" won't be the default path and its tables will not be visible.
    db_genome = GenomeDB.GenomeDatabase(drivername=self.drivername, db_user=self.db_user,
        db_passwd=self.db_passwd, hostname=self.hostname, dbname=self.dbname, schema="genome")
    db_genome.setup(create_tables=False)
    chr2size = db_genome.getTopNumberOfChomosomes(contigMaxRankBySize=80000, contigMinRankBySize=1,
        tax_id=60711, sequence_type_id=9)

    db_vervet = self.db_vervet
    if not self.data_dir:
        self.data_dir = db_vervet.data_dir
    if not self.local_data_dir:
        self.local_data_dir = db_vervet.data_dir
    """
    #unless the db_vervet connection code is commented out, schema "genome" won't be the default path.
    db_genome = GenomeDB.GenomeDatabase(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema="genome")
    db_genome.setup(create_tables=False)
    chr2size = db_genome.getTopNumberOfChomosomes(contigMaxRankBySize=80000, contigMinRankBySize=1,
        tax_id=60711, sequence_type_id=9)
    """
    workflow = self.initiateWorkflow()
    self.registerJars(workflow)
    self.registerCommonExecutables(workflow)
    self.registerCustomExecutables(workflow)

    refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
    refFastaFname = os.path.join(self.data_dir, refSequence.path)
    registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname,
        registerAffiliateFiles=True, input_site_handler=self.input_site_handler,
        checkAffiliateFileExistence=True)

    depthFoldChange = self.depthFoldChangeLs[0]
    if depthFoldChange > 0:
        self.outputAlignmentDepthAndOthersForFilter(db_vervet=db_vervet,
            outputFname=self.alnStatForFilterFname,
            ref_ind_seq_id=self.ref_ind_seq_id,
            foldChange=depthFoldChange, minGQ=self.minGQ)
        alnStatForFilterF = self.registerOneInputFile(inputFname=os.path.abspath(self.alnStatForFilterFname),
            folderName=self.pegasusFolderName)
    else:
        alnStatForFilterF = None
    if self.keepSNPPosFname:
        keepSNPPosF = self.registerOneInputFile(inputFname=os.path.abspath(self.keepSNPPosFname),
            folderName=self.pegasusFolderName)
    else:
        keepSNPPosF = None

    # 2012.5.1 filter only on the 1st vcf folder
    inputData = self.registerAllInputFiles(workflow, self.inputDir,
        input_site_handler=self.input_site_handler,
        checkEmptyVCFByReading=self.checkEmptyVCFByReading,
        pegasusFolderName="%s" % (self.pegasusFolderName),
        maxContigID=self.maxContigID,
        minContigID=self.minContigID)

    counter = 0
    for minMAC in self.minMACLs:
        for minMAF in self.minMAFLs:
            for maxSNPMissingRate in self.maxSNPMissingRateLs:
                for minDepthPerGenotype in self.minDepthPerGenotypeLs:
                    folderSignature = 'minMAC%s_minMAF%s_maxSNPMissingRate%s_minDepthPerGenotype%s_depthFoldChange%s_keepSNPFile%s_onlyBiAllelic%s' % \
                        (minMAC, minMAF, maxSNPMissingRate, minDepthPerGenotype, depthFoldChange,
                        getattr(keepSNPPosF, 'name', None), self.onlyKeepBiAllelicSNP)
                    filterJobData = self.addJobsToFilterOneVCFDir(workflow, inputData=inputData,
                        registerReferenceData=registerReferenceData,
                        alnStatForFilterF=alnStatForFilterF, keepSNPPosF=keepSNPPosF,
                        onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP,
                        minMAC=minMAC, minMAF=minMAF, maxSNPMissingRate=maxSNPMissingRate,
                        minDepthPerGenotype=minDepthPerGenotype, transferOutput=False,
                        outputDirPrefix="run%s_%s" % (counter, folderSignature))
                    returnData = self.addStatCalculationJobs(workflow=workflow, inputData=filterJobData,
                        registerReferenceData=registerReferenceData, chr2size=chr2size,
                        windowSize=self.windowSize, minChrLengthForPlot=self.minChrLengthForPlot,
                        minChrSize=self.minChrSize, LDWindowSize=self.LDWindowSize, transferOutput=True,
                        outputDirPrefix="run%s_%s" % (counter, folderSignature),
                        samplingRate=self.samplingRate, minSiteGap=30000)
                    counter += 1

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
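# The four nested loops above enumerate the Cartesian product of the filter
# parameters. The same sweep can be written flat with itertools.product; a
# minimal sketch, assuming the same *Ls list attributes as above:
import itertools

def iterFilterSettings(minMACLs, minMAFLs, maxSNPMissingRateLs, minDepthPerGenotypeLs):
    """Yield (counter, minMAC, minMAF, maxSNPMissingRate, minDepthPerGenotype) tuples."""
    combos = itertools.product(minMACLs, minMAFLs, maxSNPMissingRateLs, minDepthPerGenotypeLs)
    for counter, combo in enumerate(combos):
        yield (counter,) + combo

# e.g.: for counter, minMAC, minMAF, maxSNPMissingRate, minDepthPerGenotype in \
#           iterFilterSettings([2], [0.05], [0.2, 0.4], [1, 3]): ...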
def run(self):
    """
    2011-7-11
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
    db_vervet.setup(create_tables=False)
    self.db_vervet = db_vervet

    if not self.data_dir:
        self.data_dir = db_vervet.data_dir
    if not self.local_data_dir:
        self.local_data_dir = db_vervet.data_dir

    refName2size = self.getTopNumberOfContigs(self.topNumberOfContigs, contigMinRankBySize=self.contigMinRankBySize)
    #refName2size = set(['Contig149'])    #temporary when testing Contig149
    #refName2size = set(['1MbBAC'])    #temporary when testing the 1Mb-BAC (formerly vervet_path2)
    refNameLs = refName2size.keys()

    alignmentLs = self.getAlignments(self.ref_ind_seq_id, ind_seq_id_ls=self.ind_seq_id_ls,
        ind_aln_id_ls=self.ind_aln_id_ls, alignment_method_id=2, data_dir=self.local_data_dir)
    # site id 447 is the VRC site
    alignmentLs = self.filterAlignments(alignmentLs, max_coverage=self.max_coverage,
        individual_site_id=self.site_id)

    workflowName = os.path.splitext(os.path.basename(self.outputFname))[0]
    workflow = self.initiateWorkflow(workflowName)

    refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
    refFastaFname = os.path.join(self.data_dir, refSequence.path)
    registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname,
        registerAffiliateFiles=True, input_site_handler=self.input_site_handler,
        checkAffiliateFileExistence=True)

    self.registerJars(workflow)
    self.registerCommonExecutables(workflow)
    self.registerCustomExecutables(workflow)

    if self.run_type == 1:    #multi-sample calling
        dirPrefix = ""
        # Add a mkdir job for each output directory.
        callOutputDir = "%scall" % (dirPrefix)
        callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir,
            namespace=workflow.namespace, version=workflow.version)
        gatkDir = "%sgatk" % (dirPrefix)
        gatkDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=gatkDir,
            namespace=workflow.namespace, version=workflow.version)
        samtoolsDir = "%ssamtools" % (dirPrefix)
        samtoolsDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=samtoolsDir,
            namespace=workflow.namespace, version=workflow.version)
        unionDir = "%sgatk_samtools_union" % (dirPrefix)
        unionDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=unionDir,
            namespace=workflow.namespace, version=workflow.version)
        intersectionDir = "%sgatk_samtools_intersection" % (dirPrefix)
        intersectionDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=intersectionDir,
            namespace=workflow.namespace, version=workflow.version)

        self.addGenotypeCallJobs(workflow, alignmentLs, refName2size, samtools=workflow.samtools,
            genotyperJava=workflow.genotyperJava, GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar,
            addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava,
            AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar,
            CreateSequenceDictionaryJava=workflow.CreateSequenceDictionaryJava,
            CreateSequenceDictionaryJar=workflow.CreateSequenceDictionaryJar,
            MergeSamFilesJar=workflow.MergeSamFilesJar,
            BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar,
            mv=workflow.mv, CallVariantBySamtools=workflow.CallVariantBySamtools,
            bgzip_tabix=workflow.bgzip_tabix, vcf_convert=workflow.vcf_convert,
            vcf_isec=workflow.vcf_isec, vcf_concat=workflow.vcf_concat,
            concatGATK=workflow.concatGATK, concatSamtools=workflow.concatSamtools,
            genotypeCallByCoverage=workflow.genotypeCallByCoverage,
            registerReferenceData=registerReferenceData, bamListF=None,
            callOutputDirJob=callOutputDirJob, gatkDirJob=gatkDirJob, samtoolsDirJob=samtoolsDirJob,
            unionDirJob=unionDirJob, intersectionDirJob=intersectionDirJob,
            namespace=workflow.namespace, version=workflow.version,
            site_handler=self.site_handler, input_site_handler=self.input_site_handler,
            seqCoverageF=None,
            needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob,
            chunkSize=2000000, site_type=self.site_type, data_dir=self.data_dir)

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
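# chunkSize=2000000 above suggests the genotype-calling jobs run on fixed-size
# genomic windows. A minimal, hypothetical sketch of that chunking, assuming
# chr2size maps chromosome/contig name to its length in base pairs:
def enumerateGenomicChunks(chr2size, chunkSize=2000000):
    """Yield (chromosome, start, stop) as 1-based, inclusive intervals."""
    for chromosome, size in chr2size.items():
        for start in range(1, size + 1, chunkSize):
            yield (chromosome, start, min(start + chunkSize - 1, size))

# e.g. enumerateGenomicChunks({'Contig149': 5000000}) yields
# ('Contig149', 1, 2000000), ('Contig149', 2000001, 4000000), ('Contig149', 4000001, 5000000)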
def run(self):
    """
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = self.db_vervet
    if not self.data_dir:
        self.data_dir = db_vervet.data_dir
    if not self.local_data_dir:
        self.local_data_dir = db_vervet.data_dir

    # Create an abstract DAG
    workflowName = os.path.splitext(os.path.basename(self.outputFname))[0]
    workflow = self.initiateWorkflow(workflowName)

    self.registerJars(workflow)
    self.registerCommonExecutables(workflow)
    self.registerCustomExecutables(workflow)

    refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
    refFastaFname = os.path.join(self.data_dir, refSequence.path)
    registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname,
        registerAffiliateFiles=True, input_site_handler=self.input_site_handler,
        checkAffiliateFileExistence=True)
    refFastaFList = registerReferenceData.refFastaFList

    self.outputAlignmentDepthAndOthersForFilter(self.alnStatForFilterFname,
        ref_ind_seq_id=self.ref_ind_seq_id,
        foldChange=self.depthFoldChange, minGQ=self.minGQ)
    alnStatForFilterF = self.registerOneInputFile(workflow, self.alnStatForFilterFname)

    # names to distinguish between vcf1Dir and vcf2Dir
    vcf1Name = self.findProperVCFDirIdentifier(self.vcf1Dir, defaultName='vcf1')
    vcf2Name = self.findProperVCFDirIdentifier(self.vcf2Dir, defaultName='vcf2')
    if vcf2Name == vcf1Name or not vcf2Name:
        vcf2Name = "vcf2"

    no_of_jobs = 0
    vcf1DepthFilterDir = "%s_DepthFilter" % (vcf1Name)
    vcf1DepthFilterDirJob = self.addMkDirJob(outputDir=vcf1DepthFilterDir)
    #vcf2DepthFilterDir = "%s_DepthFilter"%(vcf2Name)
    #vcf2DepthFilterDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=vcf2DepthFilterDir)
    trioInconsistencyDir = "trioInconsistency"
    trioInconsistencyDirJob = self.addMkDirJob(outputDir=trioInconsistencyDir)
    SNPMismatchStatDir = "SNPMismatchStat"
    SNPMismatchStatDirJob = self.addMkDirJob(outputDir=SNPMismatchStatDir)

    input_site_handler = self.input_site_handler

    # whole-genome reduction job
    wholeGenomeSiteStatFile = File('siteStatAndTrioInconsistency.tsv')
    wholeGenomeSiteStatMergeJob = self.addStatMergeJob(workflow,
        statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
        outputF=wholeGenomeSiteStatFile, transferOutput=False)
    wholeGenomeSiteStatBGzipFile = File("%s.gz" % wholeGenomeSiteStatFile.name)
    wholeGenomeSiteStatBGZipTabixJob = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=workflow.bgzip_tabix,
        parentJob=wholeGenomeSiteStatMergeJob, inputF=wholeGenomeSiteStatFile,
        outputF=wholeGenomeSiteStatBGzipFile,
        transferOutput=True, tabixArguments="-s 1 -b 2 -e 2")
    no_of_jobs += 5

    # read trioInconsistencyByPosistionFname, figure out which contigs it contains,
    # and add one extraction job per contig
    chrLs = self.getChrListInTrioInconsistencyFile(self.tabixPath, self.trioInconsistencyByPosistionFname)
    chr2tabixRetrieveJob = {}
    trioInconsistencyByPosistionF = self.registerOneInputFile(workflow, self.trioInconsistencyByPosistionFname)
    trioInconsistencyByPosistion_tbi_Fname = '%s.tbi' % (self.trioInconsistencyByPosistionFname)
    trioInconsistencyByPosistion_tbi_F = self.registerOneInputFile(workflow, trioInconsistencyByPosistion_tbi_Fname)
    for chr in chrLs:
        outputF = File(os.path.join(trioInconsistencyDir, '%s.trioInconsistency.tsv' % chr))
        tabixRetrieveJob = self.addTabixRetrieveJob(workflow, executable=workflow.tabixRetrieve,
            tabixPath=self.tabixPath,
            inputF=trioInconsistencyByPosistionF, outputF=outputF, regionOfInterest=chr, includeHeader=True,
            parentJobLs=[trioInconsistencyDirJob], job_max_memory=100,
            extraDependentInputLs=[trioInconsistencyByPosistion_tbi_F],
            transferOutput=False)
        chr2tabixRetrieveJob[chr] = tabixRetrieveJob
        no_of_jobs += 1

    counter = 0
    no_of_vcf = 0
    no_of_good_vcf = 0
    for inputFname in os.listdir(self.vcf1Dir):
        counter += 1
        if counter % 500 == 0:
            sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files" % ('\x08'*180, no_of_jobs,
                no_of_good_vcf, no_of_vcf, counter))
        vcf1AbsPath = os.path.join(os.path.abspath(self.vcf1Dir), inputFname)
        vcf2AbsPath = os.path.join(os.path.abspath(self.vcf2Dir), inputFname)
        if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False) and not NextGenSeq.isVCFFileEmpty(vcf1AbsPath):
            if not NextGenSeq.isVCFFileEmpty(vcf2AbsPath, checkContent=self.checkEmptyVCFByReading):
                # make sure the samtools vcf exists
                no_of_vcf += 1
                # find the contig id and the matching tabix job
                chr = self.getChrFromFname(inputFname)
                if not chr or chr not in chr2tabixRetrieveJob:
                    continue
                no_of_good_vcf += 1
                commonPrefix = inputFname.split('.')[0]
                vcf1 = File(os.path.join(vcf1Name, inputFname))    #relative path
                vcf1.absPath = vcf1AbsPath
                self.registerVCFAndItsTabixIndex(workflow, vcf1, input_site_handler)
                vcf2 = File(os.path.join(vcf2Name, inputFname))    #relative path
                vcf2.absPath = vcf2AbsPath
                self.registerVCFAndItsTabixIndex(workflow, vcf2, input_site_handler)

                outputSiteStatF = File(os.path.join(vcf1DepthFilterDir, '%s.siteStat.tsv' % (commonPrefix)))
                vcf1FilterByDepthJob = self.addFilterVCFByDepthJob(workflow,
                    FilterVCFByDepthJava=workflow.FilterVCFByDepthJava,
                    GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar,
                    refFastaFList=refFastaFList, inputVCFF=vcf1, outputVCFF=None,
                    outputSiteStatF=outputSiteStatF,
                    parentJobLs=[vcf1DepthFilterDirJob],
                    alnStatForFilterF=alnStatForFilterF,
                    extraDependentInputLs=[vcf1.tbi_F], onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP)

                snpMisMatchStatFile = File(os.path.join(SNPMismatchStatDir,
                    '%s_snpMismatchStat.tsv' % (os.path.splitext(commonPrefix)[0])))
                calculateSNPMismatchRateOfTwoVCFJob = self.addCalculateTwoVCFSNPMismatchRateJob(workflow,
                    executable=workflow.CalculateSNPMismatchRateOfTwoVCF,
                    vcf1=vcf1, vcf2=vcf2, snpMisMatchStatFile=snpMisMatchStatFile,
                    maxSNPMismatchRate=1.0, parentJobLs=[SNPMismatchStatDirJob],
                    job_max_memory=1000, extraDependentInputLs=[],
                    transferOutput=False)

                # add a ReduceMatrixByMergeColumnsWithSameKey job
                chrMergingStatF = File('%s_variantSiteStatAndTrioInconsistencyRate.tsv' % (chr))
                chrMergingStatJob = self.addStatMergeJob(workflow,
                    statMergeProgram=workflow.ReduceMatrixByMergeColumnsWithSameKey,
                    outputF=chrMergingStatF, extraArguments='-k 0,1', transferOutput=False)
                tabixRetrieveJob = chr2tabixRetrieveJob[chr]
                self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob,
                    inputF=tabixRetrieveJob.output,
                    parentJobLs=[tabixRetrieveJob])
                self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob,
                    inputF=outputSiteStatF,
                    parentJobLs=[vcf1FilterByDepthJob])
                self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob,
                    inputF=snpMisMatchStatFile,
                    parentJobLs=[calculateSNPMismatchRateOfTwoVCFJob])

                # add to the whole-genome reduction job
                self.addInputToStatMergeJob(workflow, statMergeJob=wholeGenomeSiteStatMergeJob,
                    inputF=chrMergingStatJob.output,
                    parentJobLs=[chrMergingStatJob])
                no_of_jobs += 3
    sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files.\n" % ('\x08'*180, no_of_jobs,
        no_of_good_vcf, no_of_vcf, counter))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
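# Hypothetical helper mirroring the pairing logic above: walk vcf1Dir and keep
# only files whose same-named counterpart exists in vcf2Dir. The NextGenSeq
# emptiness checks are replaced here by a plain file-size test, so this is an
# illustrative sketch, not the pipeline's actual predicate.
import os

def iterPairedVCFs(vcf1Dir, vcf2Dir):
    """Yield (vcf1AbsPath, vcf2AbsPath) for same-named, non-empty VCFs in both folders."""
    for inputFname in sorted(os.listdir(vcf1Dir)):
        if not inputFname.endswith(('.vcf', '.vcf.gz')):
            continue
        vcf1AbsPath = os.path.join(os.path.abspath(vcf1Dir), inputFname)
        vcf2AbsPath = os.path.join(os.path.abspath(vcf2Dir), inputFname)
        if os.path.isfile(vcf2AbsPath) and os.path.getsize(vcf1AbsPath) > 0 \
                and os.path.getsize(vcf2AbsPath) > 0:
            yield vcf1AbsPath, vcf2AbsPath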