def addAllJobs(self, workflow=None, inputData=None, chr2IntervalDataLs=None, \
            data_dir=None, \
            intervalSize=3000, intervalOverlapSize=0, \
            outputDirPrefix="", passingData=None, \
            transferOutput=True, job_max_memory=2000, **keywords):
    """
    2013.06.14 bugfix regarding noOfUnits, which was previously inferred entirely from one file
    2012.7.26 architecture of the whole map-reduce framework
    """
    sys.stderr.write("Adding jobs for %s input genome files ...\n" % (len(inputData.jobDataLs)))
    returnData = PassingData()
    returnData.jobDataLs = []

    #2012.9.22
    # mapEachAlignmentDataLs is never reset.
    # mapEachChromosomeDataLs is reset upon each new alignment.
    # mapEachIntervalDataLs is reset upon each new chromosome.
    # all reduce lists never get reset.
    # fileBasenamePrefix is the prefix of the input file's basename, to be used for temporary
    #    output files in reduceEachInput(), but not for output files in mapEachInterval().
    passingData = PassingData(\
                fileBasenamePrefix=None, \
                chromosome=None, \
                outputDirPrefix=outputDirPrefix, \
                intervalFileBasenamePrefix=None,\
                registerReferenceData=None, \
                refFastaFList=None, \
                refFastaF=None,\
                fastaDictJob=None,\
                refFastaDictF=None,\
                fastaIndexJob=None,\
                refFastaIndexF=None,\
                intervalOverlapSize=intervalOverlapSize, intervalSize=intervalSize,\
                jobData=None,\
                splitInputFile=None,\
                intervalDataLs=None,\
                preReduceReturnData=None,\
                mapEachIntervalData=None,\
                mapEachIntervalDataLs=None,\
                mapEachIntervalDataLsLs=[],\
                mapEachInputData=None,\
                mapEachInputDataLs=None,\
                mapEachInputDataLsLs=[],\
                mapEachChromosomeData=None, \
                mapEachChromosomeDataLs=[], \
                chromosome2mapEachIntervalDataLs={},\
                chromosome2mapEachInputDataLs={},\
                reduceEachInputData=None,\
                reduceEachChromosomeData=None,\
                reduceEachInputDataLs=None,\
                reduceEachInputDataLsLs=[],\
                reduceEachChromosomeDataLs=[],\
                )
    # mapEachIntervalDataLsLs is a list of mapEachIntervalDataLs, one per input file.
    # mapEachInputDataLsLs is a list of mapEachInputDataLs, one per chromosome.
    # reduceEachInputDataLsLs is a list of reduceEachInputDataLs, one per chromosome.
    preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix, \
                        passingData=passingData, transferOutput=True,\
                        **keywords)
    passingData.preReduceReturnData = preReduceReturnData

    #gzip folder jobs (to avoid repeatedly creating the same folder)
    gzipReduceEachInputFolderJob = None
    gzipReduceEachChromosomeFolderJob = None
    gzipReduceFolderJob = None
    gzipPreReduceFolderJob = None

    no_of_input_files = 0
    firstInterval = True
    for chromosome, intervalDataLs in chr2IntervalDataLs.iteritems():
        passingData.chromosome = chromosome
        mapEachChromosomeData = self.mapEachChromosome(workflow=workflow, chromosome=chromosome, \
                            passingData=passingData, \
                            transferOutput=False, **keywords)
        passingData.mapEachChromosomeData = mapEachChromosomeData
        passingData.mapEachChromosomeDataLs.append(mapEachChromosomeData)

        passingData.mapEachInputDataLsLs.append([])
        #the last one from the double list is the current one
        passingData.mapEachInputDataLs = passingData.mapEachInputDataLsLs[-1]
        passingData.mapEachIntervalDataLs = []
        passingData.chromosome2mapEachIntervalDataLs[chromosome] = []

        passingData.reduceEachInputDataLsLs.append([])
        passingData.reduceEachInputDataLs = passingData.reduceEachInputDataLsLs[-1]

        for i in xrange(len(inputData.jobDataLs)):
            jobData = inputData.jobDataLs[i]
            passingData.jobData = jobData
            passingData.inputJobData = jobData

            InputFile = jobData.file
            commonFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(InputFile.name)
            passingData.fileBasenamePrefix = commonFileBasenamePrefix

            no_of_input_files += 1
            if no_of_input_files % 10 == 0:
                sys.stderr.write("%s\t%s Inputs." % ('\x08' * 40, no_of_input_files))

            for intervalData in intervalDataLs:
                selectIntervalJobData = self.selectIntervalFromInputFile(jobData=jobData, chromosome=chromosome,\
                                intervalData=intervalData, mapEachChromosomeData=mapEachChromosomeData,\
                                passingData=passingData, transferOutput=firstInterval,\
                                **keywords)
                mapEachIntervalData = self.mapEachInterval(workflow=workflow, inputJobData=jobData, \
                                selectIntervalJobData=selectIntervalJobData, \
                                chromosome=chromosome, intervalData=intervalData,\
                                mapEachChromosomeData=mapEachChromosomeData, \
                                passingData=passingData, transferOutput=firstInterval, \
                                **keywords)
                passingData.mapEachIntervalData = mapEachIntervalData
                passingData.mapEachIntervalDataLs.append(mapEachIntervalData)
                passingData.chromosome2mapEachIntervalDataLs[chromosome].append(mapEachIntervalData)

                linkMapToReduceData = self.linkMapToReduce(workflow=workflow, mapEachIntervalData=mapEachIntervalData, \
                                preReduceReturnData=preReduceReturnData, \
                                passingData=passingData, \
                                **keywords)
                if firstInterval == True:
                    firstInterval = False

            reduceEachInputData = self.reduceEachInput(workflow=workflow, chromosome=chromosome, passingData=passingData, \
                            mapEachIntervalDataLs=passingData.mapEachIntervalDataLs,\
                            transferOutput=False, data_dir=data_dir, \
                            **keywords)
            passingData.reduceEachInputData = reduceEachInputData
            passingData.reduceEachInputDataLs.append(reduceEachInputData)

            gzipReduceEachInputData = self.addGzipSubWorkflow(workflow=workflow, \
                        inputData=reduceEachInputData, transferOutput=transferOutput,\
                        outputDirPrefix="%sReduceEachInput"%(outputDirPrefix), \
                        topOutputDirJob=gzipReduceEachInputFolderJob, \
                        report=False)
            gzipReduceEachInputFolderJob = gzipReduceEachInputData.topOutputDirJob

        reduceEachChromosomeData = self.reduceEachChromosome(workflow=workflow, chromosome=chromosome, \
                        passingData=passingData, \
                        mapEachInputDataLs=passingData.mapEachInputDataLs, \
                        chromosome2mapEachIntervalDataLs=passingData.chromosome2mapEachIntervalDataLs,\
                        reduceEachInputDataLs=passingData.reduceEachInputDataLs,\
                        transferOutput=False, data_dir=data_dir, \
                        **keywords)
        passingData.reduceEachChromosomeData = reduceEachChromosomeData
        passingData.reduceEachChromosomeDataLs.append(reduceEachChromosomeData)

        gzipReduceEachChromosomeData = self.addGzipSubWorkflow(workflow=workflow, \
                    inputData=reduceEachChromosomeData, transferOutput=transferOutput,\
                    outputDirPrefix="%sReduceEachChromosome"%(outputDirPrefix), \
                    topOutputDirJob=gzipReduceEachChromosomeFolderJob, report=False)
        gzipReduceEachChromosomeFolderJob = gzipReduceEachChromosomeData.topOutputDirJob

    reduceReturnData = self.reduce(workflow=workflow, passingData=passingData, transferOutput=False, \
                    mapEachChromosomeDataLs=passingData.mapEachInputDataLs,\
                    reduceEachChromosomeDataLs=passingData.reduceEachChromosomeDataLs,\
                    **keywords)
    passingData.reduceReturnData = reduceReturnData

    if self.needGzipPreReduceReturnData:
        gzipPreReduceReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=preReduceReturnData, \
                    transferOutput=transferOutput,\
                    outputDirPrefix="%sPreReduce"%(outputDirPrefix), \
                    topOutputDirJob=gzipPreReduceFolderJob, report=False)
        gzipPreReduceFolderJob = gzipPreReduceReturnData.topOutputDirJob

    if self.needGzipReduceReturnData:
        gzipReduceReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=reduceReturnData, \
                    transferOutput=transferOutput,\
                    outputDirPrefix="%sReduce"%(outputDirPrefix), \
                    topOutputDirJob=gzipReduceFolderJob, report=False)
        gzipReduceFolderJob = gzipReduceReturnData.topOutputDirJob

    sys.stderr.write("\n%s%s Input files.\n" % ('\x08' * 40, no_of_input_files))
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    return reduceReturnData
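# A minimal usage sketch of the map-reduce skeleton above, assuming a
#    hypothetical subclass and input-registration helper (the names below are
#    illustrative, not part of this class):
#
#    class MyGenomeWorkflow(ParentWorkflowClass):
#        def mapEachInterval(self, intervalData=None, passingData=None, **keywords):
#            #add one job per (input file, interval); return its PassingData
#            return PassingData(jobDataLs=[])
#
#    wf = MyGenomeWorkflow(...)
#    inputData = wf.registerFilesOfInputDir(...)    #hypothetical helper
#    wf.addAllJobs(inputData=inputData, chr2IntervalDataLs=chr2IntervalDataLs, \
#                intervalSize=3000, intervalOverlapSize=0, outputDirPrefix="run1")
#
# Hook order: preReduce() once; per chromosome, mapEachChromosome(); per input
#    file, mapEachInterval() over every interval, then reduceEachInput(); then
#    reduceEachChromosome() per chromosome; reduce() once at the very end.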
def addCheckingVCFOverlapSubWorkflow(self, workflow=None, chr2size=None, inputVCFData1=None, inputVCFData2=None, \
                    registerReferenceData=None, outputDirPrefix="", **keywords):
    """
    2013.09.05
    """
    if workflow is None:
        workflow = self
    if registerReferenceData is None:
        registerReferenceData = self.registerReferenceData
    sys.stderr.write("Adding Check-VCF-overlap jobs between %s (batch 1) and %s (batch 2) VCF files, job count=%s ..."%\
                    (len(inputVCFData1.jobDataLs), len(inputVCFData2.jobDataLs), self.no_of_jobs))
    returnData = PassingData()

    mapDirJob = self.addMkDirJob(outputDir="%sMap"%(outputDirPrefix))
    reduceDirJob = self.addMkDirJob(outputDir="%sReduce"%(outputDirPrefix))
    plotOutputDirJob = self.addMkDirJob(outputDir="%sPlot"%(outputDirPrefix))

    overlapStatF = File(os.path.join(reduceDirJob.output, 'overlapSites.perChromosome.stat.tsv.gz'))
    overlapSitesByChromosomeMergeJob = self.addStatMergeJob(statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
                    outputF=overlapStatF, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None)
    overlapSitesMergeJob = self.addStatMergeJob(statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
                    outputF=File(os.path.join(reduceDirJob.output, "overlapSites.tsv.gz")), \
                    parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None)

    perSampleMatchFractionFile = File(os.path.join(reduceDirJob.output, 'perSampleMatchFraction.tsv.gz'))
    perSampleMatchFractionReduceJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixBySumSameKeyColsAndThenDivide, \
                    outputF=perSampleMatchFractionFile, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=[], transferOutput=True, \
                    extraArguments='-k 0 -v 1-2')
    returnData.perSampleMatchFractionReduceJob = perSampleMatchFractionReduceJob

    outputFile = File(os.path.join(plotOutputDirJob.output, 'perSampleMatchFraction_Hist.png'))
    #no spaces, parentheses, or other shell-vulnerable characters in the x or y axis labels
    #    (whichColumnPlotLabel, xColumnPlotLabel)
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, \
                inputFileList=[perSampleMatchFractionFile], \
                outputFile=outputFile, \
                whichColumn=None, whichColumnHeader="no_of_matches_by_no_of_non_NA_pairs", \
                whichColumnPlotLabel="matchFraction", \
                logY=None, logCount=True, valueForNonPositiveYValue=50,\
                minNoOfTotal=10,\
                figureDPI=100, samplingRate=1,\
                parentJobLs=[plotOutputDirJob, perSampleMatchFractionReduceJob], \
                extraDependentInputLs=None, \
                extraArguments=None, transferOutput=True, job_max_memory=2000)

    overlapStatSumF = File(os.path.join(reduceDirJob.output, 'overlapSites.wholeGenome.stat.tsv'))
    overlapStatSumJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixByChosenColumn, \
                    outputF=overlapStatSumF, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=[], transferOutput=True, \
                    extraArguments='-k 1000000 -v 1-25000')
    #The key column (-k 1000000) doesn't exist, so all rows are essentially merged into one.
    #25000 is an arbitrary large upper limit for the value columns; e.g. 100 monkeys => 101*3 + 9 => 312 columns.
    #2012.8.17 the number of columns no longer expands with the number of samples,
    #    because the per-sample data is split into perSampleMatchFractionFile.
    self.addInputToStatMergeJob(statMergeJob=overlapStatSumJob, inputF=overlapStatF, \
                parentJobLs=[overlapSitesByChromosomeMergeJob])

    vcfJobDataRBTree1 = self.constructGenomeFileRBTreeByFilenameInterval(jobDataStructure=inputVCFData1, chr2size=chr2size)
    vcfJobDataRBTree2 = self.constructGenomeFileRBTreeByFilenameInterval(jobDataStructure=inputVCFData2, chr2size=chr2size)

    noOfPairs = 0
    for vcfJobDataNode1 in vcfJobDataRBTree1:
        chromosome = vcfJobDataNode1.key.chromosome
        chrLength = chr2size.get(chromosome)
        if chrLength is None:
            sys.stderr.write("Warning: size for chromosome %s is unknown. Set it to 1000.\n"%(chromosome))
            chrLength = 1000
        jobData1 = vcfJobDataNode1.value

        vcfJobDataNodeListInTree2 = []
        vcfJobDataRBTree2.findNodes(key=vcfJobDataNode1.key, node_ls=vcfJobDataNodeListInTree2)
        for vcfJobDataNode2 in vcfJobDataNodeListInTree2:
            noOfPairs += 1
            jobData2 = vcfJobDataNode2.value
            #narrow down either VCF file based on the interval info
            overlap_start = max(vcfJobDataNode1.key.start, vcfJobDataNode2.key.start)
            overlap_stop = min(vcfJobDataNode1.key.stop, vcfJobDataNode2.key.stop)
            if overlap_start != vcfJobDataNode1.key.start or overlap_stop != vcfJobDataNode1.key.stop:
                fileBasenamePrefix = "%s"%(utils.getFileBasenamePrefixFromPath(jobData1.file.name))
                outputF = File(os.path.join(mapDirJob.output, "%s_%s_%s_%s.vcf"%\
                        (fileBasenamePrefix, chromosome, overlap_start, overlap_stop)))
                selectVCF1Job = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
                        inputF=jobData1.file, outputF=outputF, \
                        interval="%s:%s-%s"%(chromosome, overlap_start, overlap_stop),\
                        refFastaFList=registerReferenceData.refFastaFList, \
                        parentJobLs=[mapDirJob] + jobData1.jobLs, \
                        extraDependentInputLs=jobData1.fileLs[1:], transferOutput=False, \
                        extraArguments=None, extraArgumentList=None, job_max_memory=2000, walltime=None)
                jobData1 = self.constructJobDataFromJob(selectVCF1Job)
            if overlap_start != vcfJobDataNode2.key.start or overlap_stop != vcfJobDataNode2.key.stop:
                fileBasenamePrefix = "%s"%(utils.getFileBasenamePrefixFromPath(jobData2.file.name))
                outputF = File(os.path.join(mapDirJob.output, "%s_%s_%s_%s.vcf"%\
                        (fileBasenamePrefix, chromosome, overlap_start, overlap_stop)))
                selectVCF2Job = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
                        inputF=jobData2.file, outputF=outputF, \
                        interval="%s:%s-%s"%(chromosome, overlap_start, overlap_stop),\
                        refFastaFList=registerReferenceData.refFastaFList, \
                        parentJobLs=[mapDirJob] + jobData2.jobLs, \
                        extraDependentInputLs=jobData2.fileLs[1:], transferOutput=False, \
                        extraArguments=None, extraArgumentList=None, job_max_memory=2000, walltime=None)
                jobData2 = self.constructJobDataFromJob(selectVCF2Job)

            fileBasenamePrefix = "%s_vs_%s"%(utils.getFileBasenamePrefixFromPath(jobData1.file.name), \
                            utils.getFileBasenamePrefixFromPath(jobData2.file.name))
            outputFnamePrefix = os.path.join(mapDirJob.output, fileBasenamePrefix)
            outputFile = File("%s.tsv.gz"%(outputFnamePrefix))
            perSampleConcordanceOutputFile = File("%s_perSample.tsv.gz"%(outputFnamePrefix))
            overlapSiteOutputFile = File("%s_overlapSitePos.tsv.gz"%(outputFnamePrefix))
            checkTwoVCFOverlapJob = self.addCheckTwoVCFOverlapJob(executable=workflow.CheckTwoVCFOverlapCC, \
                    vcf1=jobData1.file, vcf2=jobData2.file, chromosome=chromosome, chrLength=chrLength, \
                    outputFile=outputFile, perSampleConcordanceOutputFile=perSampleConcordanceOutputFile, \
                    overlapSiteOutputFile=overlapSiteOutputFile,\
                    parentJobLs=[mapDirJob] + jobData1.jobLs + jobData2.jobLs, \
                    extraDependentInputLs=jobData1.fileLs[1:] + jobData2.fileLs[1:], \
                    transferOutput=False, \
                    extraArguments=None,    #"--minDepth %s"%(self.minDepth)
                    job_max_memory=1000)

            self.addInputToStatMergeJob(statMergeJob=overlapSitesByChromosomeMergeJob, \
                    inputF=checkTwoVCFOverlapJob.output, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
            self.addInputToStatMergeJob(statMergeJob=overlapSitesMergeJob, \
                    inputF=checkTwoVCFOverlapJob.overlapSitePosFile, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
            self.addInputToStatMergeJob(statMergeJob=perSampleMatchFractionReduceJob, \
                    inputF=checkTwoVCFOverlapJob.perSampleFile, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
    sys.stderr.write("%s pairs of VCF files, %s jobs.\n"%(noOfPairs, self.no_of_jobs))
    return returnData
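# The pair loop above clips both VCFs to their shared interval before
#    CheckTwoVCFOverlap runs. A minimal standalone sketch of that arithmetic
#    (illustrative function, not part of this class):
#
#    def getOverlapInterval(start1, stop1, start2, stop2):
#        """Return (start, stop) of the overlap, or None if the two are disjoint."""
#        overlap_start = max(start1, start2)
#        overlap_stop = min(stop1, stop2)
#        if overlap_start > overlap_stop:
#            return None
#        return (overlap_start, overlap_stop)
#
#    getOverlapInterval(1, 3000000, 2500000, 5500000) == (2500000, 3000000)
#    #=> both VCFs get narrowed to chrX:2500000-3000000 via SelectVariants.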
def reduce(self, workflow=None, passingData=None, reduceAfterEachAlignmentDataLs=None, transferOutput=True, **keywords):
    """
    2013.08.14 add the *2DB jobs only when their input is not empty
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if passingData.flagStatOutputMergeJob.inputLs:    #the merge job's input is neither empty nor None
        flagStat2DBLogFile = File(os.path.join(reduceOutputDirJob.output, "flagStat2DB.log"))
        flagStat2DBJob = self.addPutStuffIntoDBJob(workflow, executable=self.PutFlagstatOutput2DB, \
                    inputFileList=[passingData.flagStatOutputMergeJob.output], \
                    logFile=flagStat2DBLogFile, commit=True, \
                    parentJobLs=[reduceOutputDirJob, passingData.flagStatOutputMergeJob], \
                    extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
    if passingData.depthOfCoverageOutputMergeJob.inputLs:
        DOC2DBLogFile = File(os.path.join(reduceOutputDirJob.output, "DOC2DB.log"))
        DOC2DBJob = self.addPutStuffIntoDBJob(workflow, executable=self.PutDOCOutput2DB, \
                    inputFileList=[passingData.depthOfCoverageOutputMergeJob.output], \
                    logFile=DOC2DBLogFile, commit=True, \
                    parentJobLs=[reduceOutputDirJob, passingData.depthOfCoverageOutputMergeJob], \
                    extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)

    if self.alignmentDepthJobDataList and self.alignmentDepthIntervalMethodShortName:
        if not self.min_segment_length:
            sys.stderr.write("alignmentDepthIntervalMethodShortName=%s is given but min_segment_length (%s) is not.\n"%\
                    (self.alignmentDepthIntervalMethodShortName, self.min_segment_length))
            sys.exit(4)
        #2013.08.16
        alignmentIDList = [pdata.alignment.id for pdata in self.alignmentDepthJobDataList]
        alignmentIDListInStr = utils.getSuccinctStrOutOfList(alignmentIDList)
        #job to add an AlignmentDepthIntervalMethod
        logFile = File(os.path.join(self.logOutputDirJob.output, 'AddAlignmentDepthIntervalMethod2DB.log'))
        addMethod2DBJob = self.addGenericFile2DBJob(executable=self.AddAlignmentDepthIntervalMethod2DB, \
                    inputFile=None, inputArgumentOption="-i", \
                    outputFile=None, outputArgumentOption="-o", \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[self.logOutputDirJob], extraDependentInputLs=None, extraOutputLs=None, \
                    transferOutput=True, extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName), \
                            "--alignmentIDList %s"%(alignmentIDListInStr),\
                            "--min_segment_length %s"%(self.min_segment_length)], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)

        logFile = File(os.path.join(self.logOutputDirJob.output, 'updateMethodNoOfIntervals.log'))
        updateMethodNoOfIntervalsJob = self.addGenericFile2DBJob(executable=self.UpdateAlignmentDepthIntervalMethodNoOfIntervals, \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[self.logOutputDirJob], extraDependentInputLs=None, extraOutputLs=None, \
                    transferOutput=True, extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName)], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)

        for chromosome, chromosomeSize in self.chr2size.iteritems():
            #add a ReduceSameChromosomeAlignmentDepthFiles job
            outputFile = File(os.path.join(reduceOutputDirJob.output, \
                    '%s_alignments_chr_%s_depth.tsv.gz'%(len(self.alignmentDepthJobDataList), chromosome)))
            reduceSameChromosomeAlignmentDepthFilesJob = self.addGenericJob(executable=self.ReduceSameChromosomeAlignmentDepthFiles, \
                    inputFile=None, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob], extraDependentInputLs=None, \
                    extraArgumentList=["-w 2 --chromosomePositionColumnIndex 1 --chromosomeSize %s"%(chromosomeSize)], \
                    extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=2000, walltime=60)
            for alignmentDepthJobData in self.alignmentDepthJobDataList:
                #add a chromosome-selection job
                outputFile = File(os.path.join(passingData.topOutputDirJob.output, \
                        '%s_chr_%s.tsv.gz'%(utils.getFileBasenamePrefixFromPath(alignmentDepthJobData.file.name), chromosome)))
                selectRowsFromMatrixCCJob = self.addGenericJob(executable=self.SelectRowsFromMatrixCC, \
                        inputFile=alignmentDepthJobData.file, outputFile=outputFile, \
                        parentJobLs=alignmentDepthJobData.jobLs + [passingData.topOutputDirJob], \
                        extraDependentInputLs=None, \
                        extraArgumentList=["--inputFileSortMode 1 -w 0 --whichColumnValue %s"%(chromosome)], \
                        extraOutputLs=None,\
                        transferOutput=False, \
                        key2ObjectForJob=None, job_max_memory=1000, walltime=60)
                self.addInputToStatMergeJob(statMergeJob=reduceSameChromosomeAlignmentDepthFilesJob, \
                        inputF=selectRowsFromMatrixCCJob.output, \
                        inputArgumentOption="-i", parentJobLs=[selectRowsFromMatrixCCJob], \
                        extraDependentInputLs=None)
            #add a GADA segmentation job to figure out intervals of similar depth
            outputFile = File(os.path.join(reduceOutputDirJob.output, '%s_alignments_%s_depth_GADAOut_minSegLength%s.tsv.gz'%\
                    (len(self.alignmentDepthJobDataList), chromosome, self.min_segment_length)))
            #adjust memory based on chromosome size: 135Mb => 21.4G memory
            realInputVolume = chromosomeSize
            jobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=60000000, baseJobPropertyValue=600, \
                    minJobPropertyValue=60, maxJobPropertyValue=2400).value
            #base is 135M => 21G
            jobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=135000000, baseJobPropertyValue=25000, \
                    minJobPropertyValue=11000, maxJobPropertyValue=29000).value
            GADAJob = self.addGenericJob(executable=self.GADA, \
                    inputFile=reduceSameChromosomeAlignmentDepthFilesJob.output, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob, reduceSameChromosomeAlignmentDepthFilesJob], \
                    extraDependentInputLs=None, \
                    extraArgumentList=["--MinSegLen %s"%(self.min_segment_length), '--debug -T 10 -a 0.5'], \
                    extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=jobMaxMemory, walltime=jobWalltime)
            """
            GADAJob = self.addGenericJob(executable=self.GADA, \
                    inputFile=reduceSameChromosomeAlignmentDepthFilesJob.output, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob, reduceSameChromosomeAlignmentDepthFilesJob], extraDependentInputLs=None, \
                    extraArgumentList=["-M %s"%(self.min_segment_length)], extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=10000, walltime=200)
            """
            #job that adds an AlignmentDepthIntervalFile
            logFile = File(os.path.join(self.logOutputDirJob.output, 'AddAlignmentDepthIntervalFile2DB_chr_%s.log'%(chromosome)))
            addFile2DBJob = self.addGenericFile2DBJob(executable=self.AddAlignmentDepthIntervalFile2DB, \
                    inputFile=GADAJob.output, \
                    inputArgumentOption="-i", \
                    inputFileList=None, argumentForEachFileInInputFileList=None,\
                    outputFile=None, outputArgumentOption="-o", \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[GADAJob, addMethod2DBJob, self.logOutputDirJob], \
                    extraDependentInputLs=None, extraOutputLs=None, transferOutput=True, \
                    extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName), \
                            "--alignmentIDList %s"%(alignmentIDListInStr), '--chromosome %s'%(chromosome),\
                            "--format tsv"], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)
            workflow.depends(parent=addFile2DBJob, child=updateMethodNoOfIntervalsJob)
    sys.stderr.write(" %s jobs, %s alignments with depth jobs, %s alignments with flagstat jobs.\n"%(self.no_of_jobs, \
            self.no_of_alns_with_depth_jobs, self.no_of_alns_with_flagstat_jobs))
    return returnData
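# The GADA memory/walltime figures above come from
#    scaleJobWalltimeOrMemoryBasedOnInput(). A minimal sketch of the scaling
#    rule, assuming simple linear scaling with clamping (an assumption; the
#    real method may differ in detail):
#
#    def scaleByInputVolume(realInputVolume, baseInputVolume, baseValue, minValue, maxValue):
#        value = baseValue * float(realInputVolume) / baseInputVolume
#        return min(max(value, minValue), maxValue)
#
#    #e.g. a 270Mb chromosome against the 135Mb => 25G base doubles the request,
#    #    but is clamped at the 29G ceiling:
#    scaleByInputVolume(270000000, 135000000, 25000, 11000, 29000) == 29000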
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2013.05.01
        1. a job that outputs the pedigree from the db, with members from the VCF file;
            used by various filter programs and TrioCaller
        2. a job that extracts the high-coverage individuals from the VCF file
        3. figure out the existence of the Beagle unrelated cohort, trio cohort, pair/duo cohort,
            for the high-coverage group and for all individuals
        Needs the pedigree graph and a VCF file => all sample IDs, and only the high-coverage individuals.
    """
    returnData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, \
                    passingData=passingData, transferOutput=transferOutput, **keywords)

    self.statDirJob = self.addMkDirJob(outputDir="%sStat"%(outputDirPrefix))
    self.highCoveragePanelDirJob = self.addMkDirJob(outputDir="%sHighCoveragePanel"%(outputDirPrefix))
    self.auxDirJob = self.addMkDirJob(outputDir="%sAuxilliary"%(outputDirPrefix))
    self.beagleReduceDirJob = self.addMkDirJob(outputDir="%sReduceBeagle"%(outputDirPrefix))
    # self.reduceOutputDirJob would contain the non-replicate VCF files;
    #    this folder would store all the reduced VCF files with replicates among samples.
    self.replicateVCFDirJob = self.addMkDirJob(outputDir="%sReplicateVCF"%(outputDirPrefix))

    self.pedigreeKinshipFile = self.registerOneInputFile(inputFname=self.pedigreeKinshipFilePath, \
                    folderName='aux')

    inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(self.firstVCFJobData.file.name)
    # output the pedigree to get a pedigree file (for TrioCaller etc., which requires the pedigree
    #    to be split into trios/duos) and sampleID2FamilyCountF
    #    (for the ReplicateVCFGenotypeColumns job, setting TrioCaller up)
    pedigreeFileFormat = 2
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.sampleID2FamilyCount.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputReplicatePedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
            sampleID2FamilyCountF=sampleID2FamilyCountF,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, \
            replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #output the pedigree, with no replication of certain individuals, no trio/duo splitting
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
    #        (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
            sampleID2FamilyCountF=None,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, \
            replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #ExtractSamplesFromVCF: samples with coverage >= min_coverage
    # the input VCF does not contain replicates.
    outputFile = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.sampleIDList.tsv'%\
            (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    extractRefPanelSampleIDJob = self.addExtractSampleIDJob(inputFile=self.firstVCFJobData.file, \
            outputFile=outputFile,\
            min_coverage=self.minCoverageForRefPanel, outputFormat=3,\
            returnData=returnData,\
            transferOutput=True, \
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob])
    self.extractRefPanelSampleIDJob = extractRefPanelSampleIDJob

    # GATK SelectVariants: select the high-coverage individuals into a new VCF.
    # SelectVariants would re-generate AC, AF so that TrioCaller could read them.
    # samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    # ?can it deal with Platypus output, which does not have AC/AF/DP?
    # selectHighCoverageSampleJob is needed here because a VCF file of high-coverage members
    #    is needed for outputPedigreeOfHghCoverageSamplesJob.
    highCoverageSampleVCF = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.vcf'%\
            (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
            inputF=self.firstVCFJobData.file, \
            outputF=highCoverageSampleVCF, \
            refFastaFList=self.registerReferenceData.refFastaFList, \
            sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,\
            parentJobLs=[self.auxDirJob, self.extractRefPanelSampleIDJob] + self.firstVCFJobData.jobLs, \
            extraDependentInputLs=[self.firstVCFJobData.tbi_F], transferOutput=transferOutput, \
            extraArguments=None, job_max_memory=2000)

    # output a plink pedigree that contains these high-coverage members only:
    #    output the pedigree to get a pedigree file (for GATK, TrioCaller, own programs)
    #    and sampleID2FamilyCountF (for the ReplicateVCFGenotypeColumns job).
    # find a way to cache this job (used for the same set of samples, but different chromosome intervals).
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.%s.format%s.txt'%\
            (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.sampleID2FamilyCount.%s.format%s.txt'%\
    #        (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeOfHghCoverageSamplesJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=selectHighCoverageSampleJob.output, outputFile=pedFile, \
            sampleID2FamilyCountF=None,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=[self.auxDirJob, selectHighCoverageSampleJob], \
            extraDependentInputLs=[], transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #a job that outputs the alignment coverage (alignment.read_group, median_depth)
    alignmentDepthFile = File(os.path.join(self.auxDirJob.folder, '%s.alignmentDepth.tsv'%(inputFileBasenamePrefix)))
    self.outputAlignmentDepthJob = self.addOutputVCFAlignmentDepthRangeJob(executable=self.OutputVCFAlignmentDepthRange, \
            inputFile=self.firstVCFJobData.file, \
            ref_ind_seq_id=self.ref_ind_seq_id, depthFoldChange=None, minGQ=None,\
            outputFile=alignmentDepthFile, outputFileFormat=1,\
            extraArgumentList=None,\
            parentJobLs=[self.auxDirJob] + self.firstVCFJobData.jobLs, \
            extraDependentInputLs=None, transferOutput=True, \
            job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #a SelectDistantMembersFromGenotypeFile.py job to generate a reference panel for the 2nd-round Beagle;
    #    needs the pedigree file; produces a list of samples.
    phasedRefPanelSampleListFile = File(os.path.join(self.auxDirJob.folder, '%s.RefPanel.sampleList.maxPairwiseKinship%s.tsv'%\
            (inputFileBasenamePrefix, self.maxPairwiseKinship)))
    self.selectDistantMembersFromGenotypeFileJob = self.addGenericJob(executable=self.SelectDistantMembersFromGenotypeFile, \
            inputFile=selectHighCoverageSampleJob.output, outputFile=phasedRefPanelSampleListFile, outputArgumentOption="-o", \
            extraDependentInputLs=[self.pedigreeKinshipFile], \
            extraOutputLs=None, transferOutput=False, frontArgumentList=None, \
            extraArguments=None, \
            extraArgumentList=["--maxPairwiseKinship %s"%(self.maxPairwiseKinship), "--sampleSize 90", \
                    "--pedigreeKinshipFile", self.pedigreeKinshipFile, \
                    "--replicateIndividualTag", self.replicateIndividualTag,\
                    "--individualAlignmentCoverageFname", self.outputAlignmentDepthJob.output, \
                    "--pedigreeFname", self.outputPedigreeJob.output], \
            parentJobLs=[selectHighCoverageSampleJob, self.outputAlignmentDepthJob, self.outputPedigreeJob,\
                    self.auxDirJob],\
            no_of_cpus=None, job_max_memory=4000, walltime=120)

    """
    #analyze the pedigree graph to figure out singletons, trios, duos
    self.alignmentLs = self.db.getAlignmentsFromVCFFile(inputFname=yh_pegasus.getAbsPathOutOfFile(self.firstVCFJobData.file))
    #2013.06.14 the approach below does not work because the pedigree of extracting-high-coverage + replication
    #    is different from that of replication + extracting-high-coverage (= reality):
    #    some replicates might end up as singletons in the latter, while not so in the former.
    self.highCoverageAlignmentLs = self.db.filterAlignments(alignmentLs=self.alignmentLs, min_coverage=self.minCoverageForRefPanel, \
            max_coverage=None, individual_site_id=None, \
            sequence_filtered=None, individual_site_id_set=None, \
            mask_genotype_method_id=None, parent_individual_alignment_id=None,\
            country_id_set=None, tax_id_set=None, excludeContaminant=False, excludeTissueIDSet=None,\
            local_realigned=None, reduce_reads=None, report=False)
    """

    #a stat merge job (keeping track of how many Mendel-error sites were filtered)
    filterByRemoveMendelErrorSiteStatMergeFile = File(os.path.join(self.statDirJob.folder, 'filterByRemoveMendelErrorSiteStatMerge.tsv'))
    self.filterByRemoveMendelErrorSiteStatMergeJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixByChosenColumn, \
            outputF=filterByRemoveMendelErrorSiteStatMergeFile, \
            transferOutput=False, parentJobLs=[self.statDirJob],\
            extraArguments="--keyColumnLs 1 --valueColumnLs 2-4")
    #column 1 is the chromosome length, which is set to be the same for all rows.
    #columns 2-4 are #sitesInInput1, #sitesInInput2, #overlapping.
    returnData.jobDataLs.append(PassingData(jobLs=[self.filterByRemoveMendelErrorSiteStatMergeJob], \
            fileLs=[self.filterByRemoveMendelErrorSiteStatMergeJob.output]))

    #concordance stat reduce jobs
    #reduce the replicate concordance results from before TrioCaller (after Beagle phasing)
    """
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.allSites.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.homo.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.tsv'))
    concatenateTwoBeaglePhaseConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
            parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
            parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoBeaglePhaseConcordanceResultJob], \
            fileLs=[concatenateTwoBeaglePhaseConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceBeaglePhaseReplicateConcordanceJob_AllSites = reduceBeaglePhaseReplicateConcordanceJob_AllSites
    self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = reduceBeaglePhaseReplicateConcordanceJob_HomoOnly
    """

    #reduce the replicate concordance results from the after-TrioCaller VCFs
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.allSites.tsv'))
    reduceTrioCallerReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.homo.tsv'))
    reduceTrioCallerReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.tsv'))
    concatenateTwoTrioCallerConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
            parentJobLs=[reduceTrioCallerReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
            parentJobLs=[reduceTrioCallerReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoTrioCallerConcordanceResultJob], \
            fileLs=[concatenateTwoTrioCallerConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceTrioCallerReplicateConcordanceJob_AllSites = reduceTrioCallerReplicateConcordanceJob_AllSites
    self.reduceTrioCallerReplicateConcordanceJob_HomoOnly = reduceTrioCallerReplicateConcordanceJob_HomoOnly

    return returnData
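# The concordance reducers above rely on ReduceMatrixBySumSameKeyColsAndThenDivide.
#    Judging by its name and usage (key columns 0,1; value columns such as 2,3 =
#    match counts and non-NA pair counts), it sums the value columns across
#    same-key rows, then divides the first sum by the second. A pure-Python
#    sketch of that assumed semantics (the real program may differ):
#
#    from collections import defaultdict
#    def sumSameKeyColsAndThenDivide(rows, keyColumnLs=(0, 1), valueColumnLs=(2, 3)):
#        key2sums = defaultdict(lambda: [0.0] * len(valueColumnLs))
#        for row in rows:
#            key = tuple(row[i] for i in keyColumnLs)
#            for j, columnIndex in enumerate(valueColumnLs):
#                key2sums[key][j] += float(row[columnIndex])
#        #append the ratio, i.e. no_of_matches / no_of_non_NA_pairs, per key
#        return dict((key, sums + [sums[0] / sums[1]]) for key, sums in key2sums.iteritems())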
def addRefineGenotypeJobsViaBeagle(self, inputFile=None, vcfBaseFname=None, outputDirJob=None, statDirJob=None, \
            refFastaFList=None, intervalData=None,\
            baseInputVolume=450*2000000, realInputVolume=None,\
            parentJobLs=None, \
            transferOutput=False, \
            no_of_cpus=None, job_max_memory=2000, walltime=180, \
            max_walltime=None, **keywords):
    returnData = PassingData()
    if not hasattr(self, "outputPedigreeJob"):
        #output the pedigree, with no replication of certain individuals, no trio/duo splitting;
        #    plink format, for Beagle to read in
        pedigreeFileFormat = 4
        inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(inputFile.name)
        pedFile = File(os.path.join(outputDirJob.output, 'pedigree.%s.format%s.txt'%\
                (inputFileBasenamePrefix, pedigreeFileFormat)))
        #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
        #        (inputFileBasenamePrefix, pedigreeFileFormat)))
        self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
                inputFile=inputFile, outputFile=pedFile, \
                sampleID2FamilyCountF=None,\
                polymuttDatFile=None,\
                outputFileFormat=pedigreeFileFormat, \
                replicateIndividualTag=self.replicateIndividualTag,\
                treatEveryOneIndependent=self.treatEveryOneIndependent,\
                parentJobLs=parentJobLs + [outputDirJob], \
                extraDependentInputLs=None, transferOutput=True, \
                extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    ##### Part 2: run Beagle on everyone, with the reference panel
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(outputDirJob.folder, '%s.beagled'%(vcfBaseFname))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava, \
            inputFile=inputFile, refPanelFile=None,\
            pedFile=self.outputPedigreeJob.output,\
            outputFnamePrefix=outputFnamePrefix, \
            burninIterations=7, phaseIterations=10, \
            noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2, \
            extraArguments=None, extraArgumentList=None,\
            parentJobLs=[outputDirJob, self.outputPedigreeJob] + parentJobLs, \
            transferOutput=False, no_of_cpus=None, \
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=4000, maxJobPropertyValue=13000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=1200).value,\
            )
    returnData.beagleJob = beagleJob

    #index the .vcf.gz output of Beagle; without the index, GATK can't work on a gzipped VCF
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    tabixJob = self.addGenericJob(executable=self.tabix, \
            inputFile=beagleJob.output, inputArgumentOption="",\
            outputFile=None, outputArgumentOption="-o", \
            extraDependentInputLs=None, \
            extraOutputLs=[beagleJob.output, tabixIndexFile], transferOutput=False, \
            frontArgumentList=["-p vcf"], \
            extraArguments=None, \
            extraArgumentList=None, \
            parentJobLs=[beagleJob, outputDirJob],\
            no_of_cpus=None, \
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=2000, maxJobPropertyValue=4000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=180).value)

    #borrow the PL field from the pre-Beagle VCF for each genotype
    outputFile = File(os.path.join(outputDirJob.folder, '%s.beagled.withPL.vcf'%(vcfBaseFname)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava, \
            GenomeAnalysisTKJar=self.GenomeAnalysisTKJar, \
            GATKAnalysisType="CombineBeagleAndPreBeagleVariants",\
            inputFile=None, inputArgumentOption=None, \
            refFastaFList=refFastaFList, \
            inputFileList=None, argumentForEachFileInInputFileList="--variant",\
            interval=None, outputFile=outputFile, outputArgumentOption="--out", \
            frontArgumentList=None, extraArguments=None, \
            extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", inputFile, \
                    "-genotypeMergeOptions PRIORITIZE", "-priority first,second"], \
            extraOutputLs=None, \
            extraDependentInputLs=[inputFile] + tabixJob.outputLs, \
            parentJobLs=[beagleJob, tabixJob] + parentJobLs, transferOutput=False, \
            no_of_cpus=None, \
            key2ObjectForJob=None,\
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=2000, maxJobPropertyValue=4000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your VCF file, as GATK would then assume it is in Beagle format
    returnData.refineGenotypeJob = combineBeagleAndPreBeagleVariantsJob    #the final genotype job
    returnData.refineGenotypeJob.intervalData = intervalData    #attached so that it can be used by downstream jobs
    return returnData
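# A minimal sketch of how one caller might chain the jobs above for a single
#    interval (caller-side names such as splitVCFJob/mapDirJob are hypothetical;
#    the directory jobs and intervalData come from the surrounding workflow):
#
#    refineData = self.addRefineGenotypeJobsViaBeagle(inputFile=splitVCFJob.output, \
#                vcfBaseFname="chr1_1_3000000", \
#                outputDirJob=mapDirJob, statDirJob=self.statDirJob, \
#                refFastaFList=self.registerReferenceData.refFastaFList, \
#                intervalData=intervalData, realInputVolume=noOfSamples * intervalSize, \
#                parentJobLs=[splitVCFJob], transferOutput=False)
#    #downstream jobs hang off the final Beagle-phased, PL-carrying VCF:
#    finalVCFJob = refineData.refineGenotypeJob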