def addAllJobs(self, workflow=None, inputData=None, chr2IntervalDataLs=None, \
            data_dir=None, \
            intervalSize=3000, intervalOverlapSize=0, \
            outputDirPrefix="", passingData=None, \
            transferOutput=True, job_max_memory=2000, **keywords):
    """
    2013.06.14 bugfix regarding noOfUnits, which was previously inferred entirely from one file
    2012.7.26 architecture of the whole map-reduce framework
    """
    sys.stderr.write("Adding jobs for %s input genome files ...\n" % (len(inputData.jobDataLs)))
    returnData = PassingData()
    returnData.jobDataLs = []

    #2012.9.22
    # mapEachAlignmentDataLs is never reset.
    # mapEachChromosomeDataLs is reset upon each new alignment.
    # mapEachIntervalDataLs is reset upon each new chromosome.
    # all reduce lists never get reset.
    # fileBasenamePrefix is the prefix of the input file's basename, to be used for temporary
    #    output files in reduceEachInput(), but not for output files in mapEachInterval().
    passingData = PassingData(\
                fileBasenamePrefix=None, \
                chromosome=None, \
                outputDirPrefix=outputDirPrefix, \
                intervalFileBasenamePrefix=None,\
                registerReferenceData=None, \
                refFastaFList=None, \
                refFastaF=None,\
                fastaDictJob=None,\
                refFastaDictF=None,\
                fastaIndexJob=None,\
                refFastaIndexF=None,\
                intervalOverlapSize=intervalOverlapSize, intervalSize=intervalSize,\
                jobData=None,\
                splitInputFile=None,\
                intervalDataLs=None,\
                preReduceReturnData=None,\
                mapEachIntervalData=None,\
                mapEachIntervalDataLs=None,\
                mapEachIntervalDataLsLs=[],\
                mapEachInputData=None,\
                mapEachInputDataLs=None,\
                mapEachInputDataLsLs=[],\
                mapEachChromosomeData=None, \
                mapEachChromosomeDataLs=[], \
                chromosome2mapEachIntervalDataLs={},\
                chromosome2mapEachInputDataLs={},\
                reduceEachInputData=None,\
                reduceEachChromosomeData=None,\
                reduceEachInputDataLs=None,\
                reduceEachInputDataLsLs=[],\
                reduceEachChromosomeDataLs=[],\
                )
    # mapEachIntervalDataLsLs is a list of mapEachIntervalDataLs, one per input file.
    # mapEachInputDataLsLs is a list of mapEachInputDataLs, one per chromosome.
    # reduceEachInputDataLsLs is a list of reduceEachInputDataLs, one per chromosome.
    preReduceReturnData = self.preReduce(workflow=workflow, outputDirPrefix=outputDirPrefix, \
                        passingData=passingData, transferOutput=True,\
                        **keywords)
    passingData.preReduceReturnData = preReduceReturnData

    #gzip folder jobs (to avoid repeatedly creating the same folder)
    gzipReduceEachInputFolderJob = None
    gzipReduceEachChromosomeFolderJob = None
    gzipReduceFolderJob = None
    gzipPreReduceFolderJob = None

    no_of_input_files = 0
    firstInterval = True
    for chromosome, intervalDataLs in chr2IntervalDataLs.iteritems():
        passingData.chromosome = chromosome
        mapEachChromosomeData = self.mapEachChromosome(workflow=workflow, chromosome=chromosome, \
                            passingData=passingData, \
                            transferOutput=False, **keywords)
        passingData.mapEachChromosomeData = mapEachChromosomeData
        passingData.mapEachChromosomeDataLs.append(mapEachChromosomeData)

        passingData.mapEachInputDataLsLs.append([])
        #the last one from the double list is the current one
        passingData.mapEachInputDataLs = passingData.mapEachInputDataLsLs[-1]
        passingData.mapEachIntervalDataLs = []
        passingData.chromosome2mapEachIntervalDataLs[chromosome] = []

        passingData.reduceEachInputDataLsLs.append([])
        passingData.reduceEachInputDataLs = passingData.reduceEachInputDataLsLs[-1]

        for i in xrange(len(inputData.jobDataLs)):
            jobData = inputData.jobDataLs[i]
            passingData.jobData = jobData
            passingData.inputJobData = jobData

            InputFile = jobData.file
            commonFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(InputFile.name)
            passingData.fileBasenamePrefix = commonFileBasenamePrefix

            no_of_input_files += 1
            if no_of_input_files % 10 == 0:
                sys.stderr.write("%s\t%s Inputs." % ('\x08' * 40, no_of_input_files))

            for intervalData in intervalDataLs:
                selectIntervalJobData = self.selectIntervalFromInputFile(jobData=jobData, chromosome=chromosome,\
                                intervalData=intervalData, mapEachChromosomeData=mapEachChromosomeData,\
                                passingData=passingData, transferOutput=firstInterval,\
                                **keywords)
                mapEachIntervalData = self.mapEachInterval(workflow=workflow, inputJobData=jobData, \
                                selectIntervalJobData=selectIntervalJobData, \
                                chromosome=chromosome, intervalData=intervalData,\
                                mapEachChromosomeData=mapEachChromosomeData, \
                                passingData=passingData, transferOutput=firstInterval, \
                                **keywords)
                passingData.mapEachIntervalData = mapEachIntervalData
                passingData.mapEachIntervalDataLs.append(mapEachIntervalData)
                passingData.chromosome2mapEachIntervalDataLs[chromosome].append(mapEachIntervalData)

                linkMapToReduceData = self.linkMapToReduce(workflow=workflow, mapEachIntervalData=mapEachIntervalData, \
                                preReduceReturnData=preReduceReturnData, \
                                passingData=passingData, \
                                **keywords)
                if firstInterval == True:
                    firstInterval = False

            reduceEachInputData = self.reduceEachInput(workflow=workflow, chromosome=chromosome, passingData=passingData, \
                            mapEachIntervalDataLs=passingData.mapEachIntervalDataLs,\
                            transferOutput=False, data_dir=data_dir, \
                            **keywords)
            passingData.reduceEachInputData = reduceEachInputData
            passingData.reduceEachInputDataLs.append(reduceEachInputData)

            gzipReduceEachInputData = self.addGzipSubWorkflow(workflow=workflow, \
                        inputData=reduceEachInputData, transferOutput=transferOutput,\
                        outputDirPrefix="%sReduceEachInput"%(outputDirPrefix), \
                        topOutputDirJob=gzipReduceEachInputFolderJob, \
                        report=False)
            gzipReduceEachInputFolderJob = gzipReduceEachInputData.topOutputDirJob

        reduceEachChromosomeData = self.reduceEachChromosome(workflow=workflow, chromosome=chromosome, \
                        passingData=passingData, \
                        mapEachInputDataLs=passingData.mapEachInputDataLs, \
                        chromosome2mapEachIntervalDataLs=passingData.chromosome2mapEachIntervalDataLs,\
                        reduceEachInputDataLs=passingData.reduceEachInputDataLs,\
                        transferOutput=False, data_dir=data_dir, \
                        **keywords)
        passingData.reduceEachChromosomeData = reduceEachChromosomeData
        passingData.reduceEachChromosomeDataLs.append(reduceEachChromosomeData)

        gzipReduceEachChromosomeData = self.addGzipSubWorkflow(workflow=workflow, \
                    inputData=reduceEachChromosomeData, transferOutput=transferOutput,\
                    outputDirPrefix="%sReduceEachChromosome"%(outputDirPrefix), \
                    topOutputDirJob=gzipReduceEachChromosomeFolderJob, report=False)
        gzipReduceEachChromosomeFolderJob = gzipReduceEachChromosomeData.topOutputDirJob

    reduceReturnData = self.reduce(workflow=workflow, passingData=passingData, transferOutput=False, \
                    mapEachChromosomeDataLs=passingData.mapEachInputDataLs,\
                    reduceEachChromosomeDataLs=passingData.reduceEachChromosomeDataLs,\
                    **keywords)
    passingData.reduceReturnData = reduceReturnData

    if self.needGzipPreReduceReturnData:
        gzipPreReduceReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=preReduceReturnData, \
                    transferOutput=transferOutput,\
                    outputDirPrefix="%sPreReduce"%(outputDirPrefix), \
                    topOutputDirJob=gzipPreReduceFolderJob, report=False)
        gzipPreReduceFolderJob = gzipPreReduceReturnData.topOutputDirJob

    if self.needGzipReduceReturnData:
        gzipReduceReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=reduceReturnData, \
                    transferOutput=transferOutput,\
                    outputDirPrefix="%sReduce"%(outputDirPrefix), \
                    topOutputDirJob=gzipReduceFolderJob, report=False)
        gzipReduceFolderJob = gzipReduceReturnData.topOutputDirJob

    sys.stderr.write("\n%s%s Input files.\n" % ('\x08' * 40, no_of_input_files))
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    return reduceReturnData
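# A minimal usage sketch of the map-reduce skeleton above, assuming a
#    hypothetical subclass and input-registration helper (the names below are
#    illustrative, not part of this class):
#
#    class MyGenomeWorkflow(ParentWorkflowClass):
#        def mapEachInterval(self, intervalData=None, passingData=None, **keywords):
#            #add one job per (input file, interval); return its PassingData
#            return PassingData(jobDataLs=[])
#
#    wf = MyGenomeWorkflow(...)
#    inputData = wf.registerFilesOfInputDir(...)    #hypothetical helper
#    wf.addAllJobs(inputData=inputData, chr2IntervalDataLs=chr2IntervalDataLs, \
#                intervalSize=3000, intervalOverlapSize=0, outputDirPrefix="run1")
#
# Hook order: preReduce() once; per chromosome, mapEachChromosome(); per input
#    file, mapEachInterval() over every interval, then reduceEachInput(); then
#    reduceEachChromosome() per chromosome; reduce() once at the very end.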
def addCheckingVCFOverlapSubWorkflow(self, workflow=None, chr2size=None, inputVCFData1=None, inputVCFData2=None, \
                    registerReferenceData=None, outputDirPrefix="", **keywords):
    """
    2013.09.05
    """
    if workflow is None:
        workflow = self
    if registerReferenceData is None:
        registerReferenceData = self.registerReferenceData
    sys.stderr.write("Adding Check-VCF-overlap jobs between %s (batch 1) and %s (batch 2) VCF files, job count=%s ..."%\
                    (len(inputVCFData1.jobDataLs), len(inputVCFData2.jobDataLs), self.no_of_jobs))
    returnData = PassingData()

    mapDirJob = self.addMkDirJob(outputDir="%sMap"%(outputDirPrefix))
    reduceDirJob = self.addMkDirJob(outputDir="%sReduce"%(outputDirPrefix))
    plotOutputDirJob = self.addMkDirJob(outputDir="%sPlot"%(outputDirPrefix))

    overlapStatF = File(os.path.join(reduceDirJob.output, 'overlapSites.perChromosome.stat.tsv.gz'))
    overlapSitesByChromosomeMergeJob = self.addStatMergeJob(statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
                    outputF=overlapStatF, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None)
    overlapSitesMergeJob = self.addStatMergeJob(statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
                    outputF=File(os.path.join(reduceDirJob.output, "overlapSites.tsv.gz")), \
                    parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None)

    perSampleMatchFractionFile = File(os.path.join(reduceDirJob.output, 'perSampleMatchFraction.tsv.gz'))
    perSampleMatchFractionReduceJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixBySumSameKeyColsAndThenDivide, \
                    outputF=perSampleMatchFractionFile, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=[], transferOutput=True, \
                    extraArguments='-k 0 -v 1-2')
    returnData.perSampleMatchFractionReduceJob = perSampleMatchFractionReduceJob

    outputFile = File(os.path.join(plotOutputDirJob.output, 'perSampleMatchFraction_Hist.png'))
    #no spaces, parentheses, or other shell-vulnerable characters in the x or y axis labels
    #    (whichColumnPlotLabel, xColumnPlotLabel)
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, \
                inputFileList=[perSampleMatchFractionFile], \
                outputFile=outputFile, \
                whichColumn=None, whichColumnHeader="no_of_matches_by_no_of_non_NA_pairs", \
                whichColumnPlotLabel="matchFraction", \
                logY=None, logCount=True, valueForNonPositiveYValue=50,\
                minNoOfTotal=10,\
                figureDPI=100, samplingRate=1,\
                parentJobLs=[plotOutputDirJob, perSampleMatchFractionReduceJob], \
                extraDependentInputLs=None, \
                extraArguments=None, transferOutput=True, job_max_memory=2000)

    overlapStatSumF = File(os.path.join(reduceDirJob.output, 'overlapSites.wholeGenome.stat.tsv'))
    overlapStatSumJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixByChosenColumn, \
                    outputF=overlapStatSumF, parentJobLs=[reduceDirJob], \
                    extraDependentInputLs=[], transferOutput=True, \
                    extraArguments='-k 1000000 -v 1-25000')
    #The key column (-k 1000000) doesn't exist, so all rows are essentially merged into one.
    #25000 is an arbitrary large upper limit for the value columns; e.g. 100 monkeys => 101*3 + 9 => 312 columns.
    #2012.8.17 the number of columns no longer expands with the number of samples,
    #    because the per-sample data is split into perSampleMatchFractionFile.
    self.addInputToStatMergeJob(statMergeJob=overlapStatSumJob, inputF=overlapStatF, \
                parentJobLs=[overlapSitesByChromosomeMergeJob])

    vcfJobDataRBTree1 = self.constructGenomeFileRBTreeByFilenameInterval(jobDataStructure=inputVCFData1, chr2size=chr2size)
    vcfJobDataRBTree2 = self.constructGenomeFileRBTreeByFilenameInterval(jobDataStructure=inputVCFData2, chr2size=chr2size)

    noOfPairs = 0
    for vcfJobDataNode1 in vcfJobDataRBTree1:
        chromosome = vcfJobDataNode1.key.chromosome
        chrLength = chr2size.get(chromosome)
        if chrLength is None:
            sys.stderr.write("Warning: size for chromosome %s is unknown. Set it to 1000.\n"%(chromosome))
            chrLength = 1000
        jobData1 = vcfJobDataNode1.value

        vcfJobDataNodeListInTree2 = []
        vcfJobDataRBTree2.findNodes(key=vcfJobDataNode1.key, node_ls=vcfJobDataNodeListInTree2)
        for vcfJobDataNode2 in vcfJobDataNodeListInTree2:
            noOfPairs += 1
            jobData2 = vcfJobDataNode2.value
            #narrow down either VCF file based on the interval info
            overlap_start = max(vcfJobDataNode1.key.start, vcfJobDataNode2.key.start)
            overlap_stop = min(vcfJobDataNode1.key.stop, vcfJobDataNode2.key.stop)
            if overlap_start != vcfJobDataNode1.key.start or overlap_stop != vcfJobDataNode1.key.stop:
                fileBasenamePrefix = "%s"%(utils.getFileBasenamePrefixFromPath(jobData1.file.name))
                outputF = File(os.path.join(mapDirJob.output, "%s_%s_%s_%s.vcf"%\
                        (fileBasenamePrefix, chromosome, overlap_start, overlap_stop)))
                selectVCF1Job = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
                        inputF=jobData1.file, outputF=outputF, \
                        interval="%s:%s-%s"%(chromosome, overlap_start, overlap_stop),\
                        refFastaFList=registerReferenceData.refFastaFList, \
                        parentJobLs=[mapDirJob] + jobData1.jobLs, \
                        extraDependentInputLs=jobData1.fileLs[1:], transferOutput=False, \
                        extraArguments=None, extraArgumentList=None, job_max_memory=2000, walltime=None)
                jobData1 = self.constructJobDataFromJob(selectVCF1Job)
            if overlap_start != vcfJobDataNode2.key.start or overlap_stop != vcfJobDataNode2.key.stop:
                fileBasenamePrefix = "%s"%(utils.getFileBasenamePrefixFromPath(jobData2.file.name))
                outputF = File(os.path.join(mapDirJob.output, "%s_%s_%s_%s.vcf"%\
                        (fileBasenamePrefix, chromosome, overlap_start, overlap_stop)))
                selectVCF2Job = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
                        inputF=jobData2.file, outputF=outputF, \
                        interval="%s:%s-%s"%(chromosome, overlap_start, overlap_stop),\
                        refFastaFList=registerReferenceData.refFastaFList, \
                        parentJobLs=[mapDirJob] + jobData2.jobLs, \
                        extraDependentInputLs=jobData2.fileLs[1:], transferOutput=False, \
                        extraArguments=None, extraArgumentList=None, job_max_memory=2000, walltime=None)
                jobData2 = self.constructJobDataFromJob(selectVCF2Job)

            fileBasenamePrefix = "%s_vs_%s"%(utils.getFileBasenamePrefixFromPath(jobData1.file.name), \
                            utils.getFileBasenamePrefixFromPath(jobData2.file.name))
            outputFnamePrefix = os.path.join(mapDirJob.output, fileBasenamePrefix)
            outputFile = File("%s.tsv.gz"%(outputFnamePrefix))
            perSampleConcordanceOutputFile = File("%s_perSample.tsv.gz"%(outputFnamePrefix))
            overlapSiteOutputFile = File("%s_overlapSitePos.tsv.gz"%(outputFnamePrefix))
            checkTwoVCFOverlapJob = self.addCheckTwoVCFOverlapJob(executable=workflow.CheckTwoVCFOverlapCC, \
                    vcf1=jobData1.file, vcf2=jobData2.file, chromosome=chromosome, chrLength=chrLength, \
                    outputFile=outputFile, perSampleConcordanceOutputFile=perSampleConcordanceOutputFile, \
                    overlapSiteOutputFile=overlapSiteOutputFile,\
                    parentJobLs=[mapDirJob] + jobData1.jobLs + jobData2.jobLs, \
                    extraDependentInputLs=jobData1.fileLs[1:] + jobData2.fileLs[1:], \
                    transferOutput=False, \
                    extraArguments=None,    #"--minDepth %s"%(self.minDepth)
                    job_max_memory=1000)

            self.addInputToStatMergeJob(statMergeJob=overlapSitesByChromosomeMergeJob, \
                    inputF=checkTwoVCFOverlapJob.output, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
            self.addInputToStatMergeJob(statMergeJob=overlapSitesMergeJob, \
                    inputF=checkTwoVCFOverlapJob.overlapSitePosFile, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
            self.addInputToStatMergeJob(statMergeJob=perSampleMatchFractionReduceJob, \
                    inputF=checkTwoVCFOverlapJob.perSampleFile, \
                    parentJobLs=[checkTwoVCFOverlapJob], extraDependentInputLs=[])
    sys.stderr.write("%s pairs of VCF files, %s jobs.\n"%(noOfPairs, self.no_of_jobs))
    return returnData
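# The pair loop above clips both VCFs to their shared interval before
#    CheckTwoVCFOverlap runs. A minimal standalone sketch of that arithmetic
#    (illustrative function, not part of this class):
#
#    def getOverlapInterval(start1, stop1, start2, stop2):
#        """Return (start, stop) of the overlap, or None if the two are disjoint."""
#        overlap_start = max(start1, start2)
#        overlap_stop = min(stop1, stop2)
#        if overlap_start > overlap_stop:
#            return None
#        return (overlap_start, overlap_stop)
#
#    getOverlapInterval(1, 3000000, 2500000, 5500000) == (2500000, 3000000)
#    #=> both VCFs get narrowed to chrX:2500000-3000000 via SelectVariants.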
def reduce(self, workflow=None, passingData=None, reduceAfterEachAlignmentDataLs=None, transferOutput=True, **keywords):
    """
    2013.08.14 add the *2DB jobs only when their input is not empty
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if passingData.flagStatOutputMergeJob.inputLs:    #the merge job's input is neither empty nor None
        flagStat2DBLogFile = File(os.path.join(reduceOutputDirJob.output, "flagStat2DB.log"))
        flagStat2DBJob = self.addPutStuffIntoDBJob(workflow, executable=self.PutFlagstatOutput2DB, \
                    inputFileList=[passingData.flagStatOutputMergeJob.output], \
                    logFile=flagStat2DBLogFile, commit=True, \
                    parentJobLs=[reduceOutputDirJob, passingData.flagStatOutputMergeJob], \
                    extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
    if passingData.depthOfCoverageOutputMergeJob.inputLs:
        DOC2DBLogFile = File(os.path.join(reduceOutputDirJob.output, "DOC2DB.log"))
        DOC2DBJob = self.addPutStuffIntoDBJob(workflow, executable=self.PutDOCOutput2DB, \
                    inputFileList=[passingData.depthOfCoverageOutputMergeJob.output], \
                    logFile=DOC2DBLogFile, commit=True, \
                    parentJobLs=[reduceOutputDirJob, passingData.depthOfCoverageOutputMergeJob], \
                    extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)

    if self.alignmentDepthJobDataList and self.alignmentDepthIntervalMethodShortName:
        if not self.min_segment_length:
            sys.stderr.write("alignmentDepthIntervalMethodShortName=%s is given but min_segment_length (%s) is not.\n"%\
                    (self.alignmentDepthIntervalMethodShortName, self.min_segment_length))
            sys.exit(4)
        #2013.08.16
        alignmentIDList = [pdata.alignment.id for pdata in self.alignmentDepthJobDataList]
        alignmentIDListInStr = utils.getSuccinctStrOutOfList(alignmentIDList)
        #job to add an AlignmentDepthIntervalMethod
        logFile = File(os.path.join(self.logOutputDirJob.output, 'AddAlignmentDepthIntervalMethod2DB.log'))
        addMethod2DBJob = self.addGenericFile2DBJob(executable=self.AddAlignmentDepthIntervalMethod2DB, \
                    inputFile=None, inputArgumentOption="-i", \
                    outputFile=None, outputArgumentOption="-o", \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[self.logOutputDirJob], extraDependentInputLs=None, extraOutputLs=None, \
                    transferOutput=True, extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName), \
                            "--alignmentIDList %s"%(alignmentIDListInStr),\
                            "--min_segment_length %s"%(self.min_segment_length)], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)

        logFile = File(os.path.join(self.logOutputDirJob.output, 'updateMethodNoOfIntervals.log'))
        updateMethodNoOfIntervalsJob = self.addGenericFile2DBJob(executable=self.UpdateAlignmentDepthIntervalMethodNoOfIntervals, \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[self.logOutputDirJob], extraDependentInputLs=None, extraOutputLs=None, \
                    transferOutput=True, extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName)], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)

        for chromosome, chromosomeSize in self.chr2size.iteritems():
            #add a ReduceSameChromosomeAlignmentDepthFiles job
            outputFile = File(os.path.join(reduceOutputDirJob.output, \
                    '%s_alignments_chr_%s_depth.tsv.gz'%(len(self.alignmentDepthJobDataList), chromosome)))
            reduceSameChromosomeAlignmentDepthFilesJob = self.addGenericJob(executable=self.ReduceSameChromosomeAlignmentDepthFiles, \
                    inputFile=None, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob], extraDependentInputLs=None, \
                    extraArgumentList=["-w 2 --chromosomePositionColumnIndex 1 --chromosomeSize %s"%(chromosomeSize)], \
                    extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=2000, walltime=60)
            for alignmentDepthJobData in self.alignmentDepthJobDataList:
                #add a chromosome-selection job
                outputFile = File(os.path.join(passingData.topOutputDirJob.output, \
                        '%s_chr_%s.tsv.gz'%(utils.getFileBasenamePrefixFromPath(alignmentDepthJobData.file.name), chromosome)))
                selectRowsFromMatrixCCJob = self.addGenericJob(executable=self.SelectRowsFromMatrixCC, \
                        inputFile=alignmentDepthJobData.file, outputFile=outputFile, \
                        parentJobLs=alignmentDepthJobData.jobLs + [passingData.topOutputDirJob], \
                        extraDependentInputLs=None, \
                        extraArgumentList=["--inputFileSortMode 1 -w 0 --whichColumnValue %s"%(chromosome)], \
                        extraOutputLs=None,\
                        transferOutput=False, \
                        key2ObjectForJob=None, job_max_memory=1000, walltime=60)
                self.addInputToStatMergeJob(statMergeJob=reduceSameChromosomeAlignmentDepthFilesJob, \
                        inputF=selectRowsFromMatrixCCJob.output, \
                        inputArgumentOption="-i", parentJobLs=[selectRowsFromMatrixCCJob], \
                        extraDependentInputLs=None)
            #add a GADA segmentation job to figure out intervals of similar depth
            outputFile = File(os.path.join(reduceOutputDirJob.output, '%s_alignments_%s_depth_GADAOut_minSegLength%s.tsv.gz'%\
                    (len(self.alignmentDepthJobDataList), chromosome, self.min_segment_length)))
            #adjust memory based on chromosome size: 135Mb => 21.4G memory
            realInputVolume = chromosomeSize
            jobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=60000000, baseJobPropertyValue=600, \
                    minJobPropertyValue=60, maxJobPropertyValue=2400).value
            #base is 135M => 21G
            jobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=135000000, baseJobPropertyValue=25000, \
                    minJobPropertyValue=11000, maxJobPropertyValue=29000).value
            GADAJob = self.addGenericJob(executable=self.GADA, \
                    inputFile=reduceSameChromosomeAlignmentDepthFilesJob.output, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob, reduceSameChromosomeAlignmentDepthFilesJob], \
                    extraDependentInputLs=None, \
                    extraArgumentList=["--MinSegLen %s"%(self.min_segment_length), '--debug -T 10 -a 0.5'], \
                    extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=jobMaxMemory, walltime=jobWalltime)
            """
            GADAJob = self.addGenericJob(executable=self.GADA, \
                    inputFile=reduceSameChromosomeAlignmentDepthFilesJob.output, outputFile=outputFile, \
                    parentJobLs=[reduceOutputDirJob, reduceSameChromosomeAlignmentDepthFilesJob], extraDependentInputLs=None, \
                    extraArgumentList=["-M %s"%(self.min_segment_length)], extraOutputLs=None,\
                    transferOutput=False, \
                    key2ObjectForJob=None, job_max_memory=10000, walltime=200)
            """
            #job that adds an AlignmentDepthIntervalFile
            logFile = File(os.path.join(self.logOutputDirJob.output, 'AddAlignmentDepthIntervalFile2DB_chr_%s.log'%(chromosome)))
            addFile2DBJob = self.addGenericFile2DBJob(executable=self.AddAlignmentDepthIntervalFile2DB, \
                    inputFile=GADAJob.output, \
                    inputArgumentOption="-i", \
                    inputFileList=None, argumentForEachFileInInputFileList=None,\
                    outputFile=None, outputArgumentOption="-o", \
                    data_dir=self.data_dir, logFile=logFile, commit=True,\
                    parentJobLs=[GADAJob, addMethod2DBJob, self.logOutputDirJob], \
                    extraDependentInputLs=None, extraOutputLs=None, transferOutput=True, \
                    extraArguments=None, \
                    extraArgumentList=["--methodShortName %s"%(self.alignmentDepthIntervalMethodShortName), \
                            "--alignmentIDList %s"%(alignmentIDListInStr), '--chromosome %s'%(chromosome),\
                            "--format tsv"], \
                    job_max_memory=2000, walltime=30, sshDBTunnel=self.needSSHDBTunnel)
            workflow.depends(parent=addFile2DBJob, child=updateMethodNoOfIntervalsJob)
    sys.stderr.write(" %s jobs, %s alignments with depth jobs, %s alignments with flagstat jobs.\n"%(self.no_of_jobs, \
            self.no_of_alns_with_depth_jobs, self.no_of_alns_with_flagstat_jobs))
    return returnData
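# The GADA memory/walltime figures above come from
#    scaleJobWalltimeOrMemoryBasedOnInput(). A minimal sketch of the scaling
#    rule, assuming simple linear scaling with clamping (an assumption; the
#    real method may differ in detail):
#
#    def scaleByInputVolume(realInputVolume, baseInputVolume, baseValue, minValue, maxValue):
#        value = baseValue * float(realInputVolume) / baseInputVolume
#        return min(max(value, minValue), maxValue)
#
#    #e.g. a 270Mb chromosome against the 135Mb => 25G base doubles the request,
#    #    but is clamped at the 29G ceiling:
#    scaleByInputVolume(270000000, 135000000, 25000, 11000, 29000) == 29000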
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2013.05.01
        1. a job that outputs the pedigree from the db, with members from the VCF file;
            used by various filter programs and TrioCaller
        2. a job that extracts the high-coverage individuals from the VCF file
        3. figure out the existence of the Beagle unrelated cohort, trio cohort, pair/duo cohort,
            for the high-coverage group and for all individuals
        Needs the pedigree graph and a VCF file => all sample IDs, and only the high-coverage individuals.
    """
    returnData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, \
                    passingData=passingData, transferOutput=transferOutput, **keywords)

    self.statDirJob = self.addMkDirJob(outputDir="%sStat"%(outputDirPrefix))
    self.highCoveragePanelDirJob = self.addMkDirJob(outputDir="%sHighCoveragePanel"%(outputDirPrefix))
    self.auxDirJob = self.addMkDirJob(outputDir="%sAuxilliary"%(outputDirPrefix))
    self.beagleReduceDirJob = self.addMkDirJob(outputDir="%sReduceBeagle"%(outputDirPrefix))
    # self.reduceOutputDirJob would contain the non-replicate VCF files;
    #    this folder would store all the reduced VCF files with replicates among samples.
    self.replicateVCFDirJob = self.addMkDirJob(outputDir="%sReplicateVCF"%(outputDirPrefix))

    self.pedigreeKinshipFile = self.registerOneInputFile(inputFname=self.pedigreeKinshipFilePath, \
                    folderName='aux')

    inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(self.firstVCFJobData.file.name)
    # output the pedigree to get a pedigree file (for TrioCaller etc., which requires the pedigree
    #    to be split into trios/duos) and sampleID2FamilyCountF
    #    (for the ReplicateVCFGenotypeColumns job, setting TrioCaller up)
    pedigreeFileFormat = 2
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.sampleID2FamilyCount.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputReplicatePedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
            sampleID2FamilyCountF=sampleID2FamilyCountF,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, \
            replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #output the pedigree, with no replication of certain individuals, no trio/duo splitting
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
    #        (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
            sampleID2FamilyCountF=None,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, \
            replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #ExtractSamplesFromVCF: samples with coverage >= min_coverage
    # the input VCF does not contain replicates.
    outputFile = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.sampleIDList.tsv'%\
            (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    extractRefPanelSampleIDJob = self.addExtractSampleIDJob(inputFile=self.firstVCFJobData.file, \
            outputFile=outputFile,\
            min_coverage=self.minCoverageForRefPanel, outputFormat=3,\
            returnData=returnData,\
            transferOutput=True, \
            parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob])
    self.extractRefPanelSampleIDJob = extractRefPanelSampleIDJob

    # GATK SelectVariants: select the high-coverage individuals into a new VCF.
    # SelectVariants would re-generate AC, AF so that TrioCaller could read them.
    # samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    # ?can it deal with Platypus output, which does not have AC/AF/DP?
    # selectHighCoverageSampleJob is needed here because a VCF file of high-coverage members
    #    is needed for outputPedigreeOfHghCoverageSamplesJob.
    highCoverageSampleVCF = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.vcf'%\
            (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
            inputF=self.firstVCFJobData.file, \
            outputF=highCoverageSampleVCF, \
            refFastaFList=self.registerReferenceData.refFastaFList, \
            sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,\
            parentJobLs=[self.auxDirJob, self.extractRefPanelSampleIDJob] + self.firstVCFJobData.jobLs, \
            extraDependentInputLs=[self.firstVCFJobData.tbi_F], transferOutput=transferOutput, \
            extraArguments=None, job_max_memory=2000)

    # output a plink pedigree that contains these high-coverage members only:
    #    output the pedigree to get a pedigree file (for GATK, TrioCaller, own programs)
    #    and sampleID2FamilyCountF (for the ReplicateVCFGenotypeColumns job).
    # find a way to cache this job (used for the same set of samples, but different chromosome intervals).
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.%s.format%s.txt'%\
            (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.sampleID2FamilyCount.%s.format%s.txt'%\
    #        (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeOfHghCoverageSamplesJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
            inputFile=selectHighCoverageSampleJob.output, outputFile=pedFile, \
            sampleID2FamilyCountF=None,\
            polymuttDatFile=None,\
            outputFileFormat=pedigreeFileFormat, replicateIndividualTag=self.replicateIndividualTag,\
            treatEveryOneIndependent=self.treatEveryOneIndependent,\
            parentJobLs=[self.auxDirJob, selectHighCoverageSampleJob], \
            extraDependentInputLs=[], transferOutput=True, \
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #a job that outputs the alignment coverage (alignment.read_group, median_depth)
    alignmentDepthFile = File(os.path.join(self.auxDirJob.folder, '%s.alignmentDepth.tsv'%(inputFileBasenamePrefix)))
    self.outputAlignmentDepthJob = self.addOutputVCFAlignmentDepthRangeJob(executable=self.OutputVCFAlignmentDepthRange, \
            inputFile=self.firstVCFJobData.file, \
            ref_ind_seq_id=self.ref_ind_seq_id, depthFoldChange=None, minGQ=None,\
            outputFile=alignmentDepthFile, outputFileFormat=1,\
            extraArgumentList=None,\
            parentJobLs=[self.auxDirJob] + self.firstVCFJobData.jobLs, \
            extraDependentInputLs=None, transferOutput=True, \
            job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    #a SelectDistantMembersFromGenotypeFile.py job to generate a reference panel for the 2nd-round Beagle;
    #    needs the pedigree file; produces a list of samples.
    phasedRefPanelSampleListFile = File(os.path.join(self.auxDirJob.folder, '%s.RefPanel.sampleList.maxPairwiseKinship%s.tsv'%\
            (inputFileBasenamePrefix, self.maxPairwiseKinship)))
    self.selectDistantMembersFromGenotypeFileJob = self.addGenericJob(executable=self.SelectDistantMembersFromGenotypeFile, \
            inputFile=selectHighCoverageSampleJob.output, outputFile=phasedRefPanelSampleListFile, outputArgumentOption="-o", \
            extraDependentInputLs=[self.pedigreeKinshipFile], \
            extraOutputLs=None, transferOutput=False, frontArgumentList=None, \
            extraArguments=None, \
            extraArgumentList=["--maxPairwiseKinship %s"%(self.maxPairwiseKinship), "--sampleSize 90", \
                    "--pedigreeKinshipFile", self.pedigreeKinshipFile, \
                    "--replicateIndividualTag", self.replicateIndividualTag,\
                    "--individualAlignmentCoverageFname", self.outputAlignmentDepthJob.output, \
                    "--pedigreeFname", self.outputPedigreeJob.output], \
            parentJobLs=[selectHighCoverageSampleJob, self.outputAlignmentDepthJob, self.outputPedigreeJob,\
                    self.auxDirJob],\
            no_of_cpus=None, job_max_memory=4000, walltime=120)

    """
    #analyze the pedigree graph to figure out singletons, trios, duos
    self.alignmentLs = self.db.getAlignmentsFromVCFFile(inputFname=yh_pegasus.getAbsPathOutOfFile(self.firstVCFJobData.file))
    #2013.06.14 the approach below does not work because the pedigree of extracting-high-coverage + replication
    #    is different from that of replication + extracting-high-coverage (= reality):
    #    some replicates might end up as singletons in the latter, while not so in the former.
    self.highCoverageAlignmentLs = self.db.filterAlignments(alignmentLs=self.alignmentLs, min_coverage=self.minCoverageForRefPanel, \
            max_coverage=None, individual_site_id=None, \
            sequence_filtered=None, individual_site_id_set=None, \
            mask_genotype_method_id=None, parent_individual_alignment_id=None,\
            country_id_set=None, tax_id_set=None, excludeContaminant=False, excludeTissueIDSet=None,\
            local_realigned=None, reduce_reads=None, report=False)
    """

    #a stat merge job (keeping track of how many Mendel-error sites were filtered)
    filterByRemoveMendelErrorSiteStatMergeFile = File(os.path.join(self.statDirJob.folder, 'filterByRemoveMendelErrorSiteStatMerge.tsv'))
    self.filterByRemoveMendelErrorSiteStatMergeJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixByChosenColumn, \
            outputF=filterByRemoveMendelErrorSiteStatMergeFile, \
            transferOutput=False, parentJobLs=[self.statDirJob],\
            extraArguments="--keyColumnLs 1 --valueColumnLs 2-4")
    #column 1 is the chromosome length, which is set to be the same for all rows.
    #columns 2-4 are #sitesInInput1, #sitesInInput2, #overlapping.
    returnData.jobDataLs.append(PassingData(jobLs=[self.filterByRemoveMendelErrorSiteStatMergeJob], \
            fileLs=[self.filterByRemoveMendelErrorSiteStatMergeJob.output]))

    #concordance stat reduce jobs
    #reduce the replicate concordance results from before TrioCaller (after Beagle phasing)
    """
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.allSites.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.homo.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.tsv'))
    concatenateTwoBeaglePhaseConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
            parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
            parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoBeaglePhaseConcordanceResultJob], \
            fileLs=[concatenateTwoBeaglePhaseConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceBeaglePhaseReplicateConcordanceJob_AllSites = reduceBeaglePhaseReplicateConcordanceJob_AllSites
    self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = reduceBeaglePhaseReplicateConcordanceJob_HomoOnly
    """

    #reduce the replicate concordance results from the after-TrioCaller VCFs
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.allSites.tsv'))
    reduceTrioCallerReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.homo.tsv'))
    reduceTrioCallerReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.tsv'))
    concatenateTwoTrioCallerConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
            outputF=outputFile, \
            extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
            parentJobLs=[reduceTrioCallerReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
            parentJobLs=[reduceTrioCallerReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoTrioCallerConcordanceResultJob], \
            fileLs=[concatenateTwoTrioCallerConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceTrioCallerReplicateConcordanceJob_AllSites = reduceTrioCallerReplicateConcordanceJob_AllSites
    self.reduceTrioCallerReplicateConcordanceJob_HomoOnly = reduceTrioCallerReplicateConcordanceJob_HomoOnly

    return returnData
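# The concordance reducers above rely on ReduceMatrixBySumSameKeyColsAndThenDivide.
#    Judging by its name and usage (key columns 0,1; value columns such as 2,3 =
#    match counts and non-NA pair counts), it sums the value columns across
#    same-key rows, then divides the first sum by the second. A pure-Python
#    sketch of that assumed semantics (the real program may differ):
#
#    from collections import defaultdict
#    def sumSameKeyColsAndThenDivide(rows, keyColumnLs=(0, 1), valueColumnLs=(2, 3)):
#        key2sums = defaultdict(lambda: [0.0] * len(valueColumnLs))
#        for row in rows:
#            key = tuple(row[i] for i in keyColumnLs)
#            for j, columnIndex in enumerate(valueColumnLs):
#                key2sums[key][j] += float(row[columnIndex])
#        #append the ratio, i.e. no_of_matches / no_of_non_NA_pairs, per key
#        return dict((key, sums + [sums[0] / sums[1]]) for key, sums in key2sums.iteritems())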
def addRefineGenotypeJobsViaBeagle(self, inputFile=None, vcfBaseFname=None, outputDirJob=None, statDirJob=None, \
            refFastaFList=None, intervalData=None,\
            baseInputVolume=450*2000000, realInputVolume=None,\
            parentJobLs=None, \
            transferOutput=False, \
            no_of_cpus=None, job_max_memory=2000, walltime=180, \
            max_walltime=None, **keywords):
    returnData = PassingData()
    if not hasattr(self, "outputPedigreeJob"):
        #output the pedigree, with no replication of certain individuals, no trio/duo splitting;
        #    plink format, for Beagle to read in
        pedigreeFileFormat = 4
        inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(inputFile.name)
        pedFile = File(os.path.join(outputDirJob.output, 'pedigree.%s.format%s.txt'%\
                (inputFileBasenamePrefix, pedigreeFileFormat)))
        #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
        #        (inputFileBasenamePrefix, pedigreeFileFormat)))
        self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
                inputFile=inputFile, outputFile=pedFile, \
                sampleID2FamilyCountF=None,\
                polymuttDatFile=None,\
                outputFileFormat=pedigreeFileFormat, \
                replicateIndividualTag=self.replicateIndividualTag,\
                treatEveryOneIndependent=self.treatEveryOneIndependent,\
                parentJobLs=parentJobLs + [outputDirJob], \
                extraDependentInputLs=None, transferOutput=True, \
                extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    ##### Part 2: run Beagle on everyone, with the reference panel
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(outputDirJob.folder, '%s.beagled'%(vcfBaseFname))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava, \
            inputFile=inputFile, refPanelFile=None,\
            pedFile=self.outputPedigreeJob.output,\
            outputFnamePrefix=outputFnamePrefix, \
            burninIterations=7, phaseIterations=10, \
            noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2, \
            extraArguments=None, extraArgumentList=None,\
            parentJobLs=[outputDirJob, self.outputPedigreeJob] + parentJobLs, \
            transferOutput=False, no_of_cpus=None, \
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=4000, maxJobPropertyValue=13000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=1200).value,\
            )
    returnData.beagleJob = beagleJob

    #index the .vcf.gz output of Beagle; without the index, GATK can't work on a gzipped VCF
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    tabixJob = self.addGenericJob(executable=self.tabix, \
            inputFile=beagleJob.output, inputArgumentOption="",\
            outputFile=None, outputArgumentOption="-o", \
            extraDependentInputLs=None, \
            extraOutputLs=[beagleJob.output, tabixIndexFile], transferOutput=False, \
            frontArgumentList=["-p vcf"], \
            extraArguments=None, \
            extraArgumentList=None, \
            parentJobLs=[beagleJob, outputDirJob],\
            no_of_cpus=None, \
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=2000, maxJobPropertyValue=4000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=180).value)

    #borrow the PL field from the pre-Beagle VCF for each genotype
    outputFile = File(os.path.join(outputDirJob.folder, '%s.beagled.withPL.vcf'%(vcfBaseFname)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava, \
            GenomeAnalysisTKJar=self.GenomeAnalysisTKJar, \
            GATKAnalysisType="CombineBeagleAndPreBeagleVariants",\
            inputFile=None, inputArgumentOption=None, \
            refFastaFList=refFastaFList, \
            inputFileList=None, argumentForEachFileInInputFileList="--variant",\
            interval=None, outputFile=outputFile, outputArgumentOption="--out", \
            frontArgumentList=None, extraArguments=None, \
            extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", inputFile, \
                    "-genotypeMergeOptions PRIORITIZE", "-priority first,second"], \
            extraOutputLs=None, \
            extraDependentInputLs=[inputFile] + tabixJob.outputLs, \
            parentJobLs=[beagleJob, tabixJob] + parentJobLs, transferOutput=False, \
            no_of_cpus=None, \
            key2ObjectForJob=None,\
            job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=4000, \
                    minJobPropertyValue=2000, maxJobPropertyValue=4000).value,\
            walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume, \
                    baseInputVolume=baseInputVolume, baseJobPropertyValue=60, \
                    minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your VCF file, as GATK would then assume it is in Beagle format
    returnData.refineGenotypeJob = combineBeagleAndPreBeagleVariantsJob    #the final genotype job
    returnData.refineGenotypeJob.intervalData = intervalData    #attached so that it can be used by downstream jobs
    return returnData
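# A minimal sketch of how one caller might chain the jobs above for a single
#    interval (caller-side names such as splitVCFJob/mapDirJob are hypothetical;
#    the directory jobs and intervalData come from the surrounding workflow):
#
#    refineData = self.addRefineGenotypeJobsViaBeagle(inputFile=splitVCFJob.output, \
#                vcfBaseFname="chr1_1_3000000", \
#                outputDirJob=mapDirJob, statDirJob=self.statDirJob, \
#                refFastaFList=self.registerReferenceData.refFastaFList, \
#                intervalData=intervalData, realInputVolume=noOfSamples * intervalSize, \
#                parentJobLs=[splitVCFJob], transferOutput=False)
#    #downstream jobs hang off the final Beagle-phased, PL-carrying VCF:
#    finalVCFJob = refineData.refineGenotypeJob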