def addPickleGenomeRBDictJob(self, workflow, executable=None, outputF=None,
        genePadding=None, tax_id=3702,
        parentJobLs=[], job_max_memory=100, extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.3.22
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.genome_drivername, "-z", self.genome_hostname,
        "-d", self.genome_dbname,
        "-u", self.genome_db_user, "-p", self.genome_db_passwd,
        "--genePadding=%s" % (genePadding), "--tax_id=%s" % (tax_id), "-o", outputF)
    if self.genome_schema:
        job.addArguments("--schema=%s" % self.genome_schema)
    job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputF
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    return job
def addOutputLociIDOfResultPeakInHDF5Job(self, workflow, executable=None, peak_id=None, outputFile=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=10, **keywords):
    """
    2012.3.10
        -i 59444 -u yh -z banyan -o /tmp/peak_59444.h5
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd,
        "-i", repr(int(peak_id)), "-o", outputFile)
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
def addSmartpcaJob(self, workflow, executable=None, smartpcaParameterFile=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        outputFileList=[], job_max_memory=100, **keywords):
    """
    2012.3.1
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-p", smartpcaParameterFile)
    job.uses(smartpcaParameterFile, transfer=True, register=True, link=Link.INPUT)
    if extraArguments:
        job.addArguments(extraArguments)
    for outputF in outputFileList:
        if outputF:
            job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
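# Usage sketch for addSmartpcaJob(): smartpca reads every input/output path from the "-p"
# parameter file, so the caller must list the expected output files explicitly via
# outputFileList (and pass the EigenStrat inputs via extraDependentInputLs) for Pegasus to
# stage them. The names below (workflow.smartpca, smartpcaParFile, evecFile, evalFile,
# eigenstratJob from addConvertVCF2EigenStratJob() further down) are hypothetical
# placeholders for illustration, not objects defined in this module.
#
#   smartpcaJob = self.addSmartpcaJob(workflow, executable=workflow.smartpca,
#       smartpcaParameterFile=smartpcaParFile,
#       parentJobLs=[eigenstratJob],
#       extraDependentInputLs=[eigenstratJob.genoOutputF, eigenstratJob.locusOutputF, eigenstratJob.indOutputF],
#       outputFileList=[evecFile, evalFile], transferOutput=True, job_max_memory=2000)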
def addConvertSNPData2HDF5Job(self, workflow, executable=None, inputFile=None, outputFile=None, min_MAF=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=100, **keywords):
    """
    2012.3.2
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments('-i', inputFile, '-o', outputFile, '-n %s' % (min_MAF))
    job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
def addDrawManhattanPlotForLDInHDF5Job(self, workflow, executable=None, correlationFile=None, peak_id=None,
        datasetName=None, outputFile=None, outputFnamePrefix=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=10, **keywords):
    """
    2012.3.10
        DrawManhattanPlotForLDInHDF5.py -w secret -i /tmp/output.2.h5 -l 59444 -N correlation
            -O /tmp/gw_LD_pattern_between_snp_and_peak_59444 -u yh -p secret
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd, "-w", self.genome_db_passwd,
        "-l %s" % (peak_id), "-N", datasetName, "-i", correlationFile, "-O", outputFnamePrefix)
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(correlationFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
def addCalculateColCorBetweenTwoHDF5Job(self, workflow, executable=None, inputFile1=None, inputFile2=None, outputFile=None,
        i1_start=None, i1_stop=None, i2_start=None, i2_stop=None, min_cor=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=100, **keywords):
    """
    2012.3.2
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments('-o', outputFile, '-i', inputFile1, '-j', inputFile2,
        '-s %s' % i1_start, '-t %s' % i1_stop,
        '-u %s' % i2_start, '-v %s' % i2_stop, '-c %s' % min_cor)
    job.uses(inputFile1, transfer=True, register=True, link=Link.INPUT)
    job.uses(inputFile2, transfer=True, register=True, link=Link.INPUT)
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
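# Usage sketch for addCalculateColCorBetweenTwoHDF5Job(): the genome-wide LD computation is
# tiled over column blocks of the two HDF5 SNP matrices (the LD output-file naming echoes the
# "LD/cor_i1_0_4999_i2_0_4999.h5" example in addFindMaxLDBetweenPeakAndEachLocusJob below).
# The 5000-column block size, the executable name workflow.CalculateColCorBetweenTwoHDF5, and
# the names snpData1HDF5Job, snpData2HDF5Job (e.g. from addConvertSNPData2HDF5Job()),
# LDDirJob, noOfCols1, noOfCols2 are hypothetical placeholders.
#
#   blockSize = 5000
#   for i1_start in range(0, noOfCols1, blockSize):
#       for i2_start in range(0, noOfCols2, blockSize):
#           outputFile = File(os.path.join('LD', 'cor_i1_%s_i2_%s.h5' % (i1_start, i2_start)))
#           corJob = self.addCalculateColCorBetweenTwoHDF5Job(workflow,
#               executable=workflow.CalculateColCorBetweenTwoHDF5,
#               inputFile1=snpData1HDF5Job.output, inputFile2=snpData2HDF5Job.output,
#               outputFile=outputFile,
#               i1_start=i1_start, i1_stop=i1_start + blockSize - 1,
#               i2_start=i2_start, i2_stop=i2_start + blockSize - 1, min_cor=0.2,
#               parentJobLs=[snpData1HDF5Job, snpData2HDF5Job, LDDirJob], transferOutput=False)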
def addPickleSNPInfoJob(self, workflow, executable=None, outputF=None, call_method_id=None,
        parentJobLs=[], job_max_memory=100, extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.3.22
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd,
        "--call_method_id=%s" % (call_method_id), "-F", outputF)
    job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputF
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    return job
def addGWASPeakOverlapJob(self, workflow, executable=None,
        result1_id=None, result2_id=None, association1_peak_type_id=None,
        association2_peak_type_id=None, peak_padding=None, outputF=None,
        commit=0, results_directory=None, logFile=None,
        parentJobLs=[], job_max_memory=100, walltime=60,
        extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.2.22
        walltime is in minutes (max time allowed on hoffman2 is 24 hours).
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    # apply int() because result1_id is a long integer; repr() of a long yields "434L",
    # which would raise an exception when cast back to an integer downstream.
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd,
        "-i", repr(int(result1_id)), "-j", repr(int(result2_id)),
        "-x", repr(association1_peak_type_id), "-y", repr(association2_peak_type_id),
        "-e", repr(peak_padding), "-o", outputF)
    job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    if commit:
        job.addArguments("-c")
    if results_directory:
        job.addArguments("-t", results_directory)
    if self.schema:
        job.addArguments("-k", self.schema)
    if logFile:
        job.addArguments("--logFilename=%s" % (logFile.name))
        job.uses(logFile, transfer=transferOutput, register=transferOutput, link=Link.OUTPUT)
        job.output = logFile
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory, walltime=walltime)
    return job
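# Usage sketch for addGWASPeakOverlapJob(): compares the association peaks of two db results
# and optionally commits the overlap to the db. The result ids, peak type ids, and the names
# workflow.TwoGWASPeakOverlap, overlapOutputF, overlapDirJob are hypothetical example values
# for illustration only.
#
#   overlapJob = self.addGWASPeakOverlapJob(workflow, executable=workflow.TwoGWASPeakOverlap,
#       result1_id=4634, result2_id=4635,
#       association1_peak_type_id=1, association2_peak_type_id=1,
#       peak_padding=10000, outputF=overlapOutputF, commit=1,
#       parentJobLs=[overlapDirJob], job_max_memory=500, walltime=120, transferOutput=True)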
def addPutReadBaseCountIntoDBJob(self, workflow, executable=None, inputFileLs=[], logFile=None, commit=False,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=10, sshDBTunnel=1, **keywords):
    """
    2012.5.3
        add argument sshDBTunnel
    2012.3.14
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("--drivername", self.drivername, "--hostname", self.hostname, "--dbname", self.dbname,
        "--db_user", self.db_user, "--db_passwd", self.db_passwd,
        "--logFilename", logFile)
    if extraArguments:
        job.addArguments(extraArguments)
    if commit:
        job.addArguments("--commit")
    for inputFile in inputFileLs:
        job.addArguments(inputFile)
        job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(logFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = logFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory, sshDBTunnel=sshDBTunnel)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
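# Usage sketch for addPutReadBaseCountIntoDBJob(): gathers the per-fastq count files produced
# by addCountFastqReadBaseCountJob() and loads them into the db through one job, with an ssh
# tunnel to the db host (sshDBTunnel=1). workflow.PutReadBaseCountIntoDB, countJobLs and
# putCountLogFile are hypothetical placeholders.
#
#   putCountJob = self.addPutReadBaseCountIntoDBJob(workflow,
#       executable=workflow.PutReadBaseCountIntoDB,
#       inputFileLs=[countJob.output for countJob in countJobLs],
#       logFile=putCountLogFile, commit=True,
#       parentJobLs=countJobLs, transferOutput=True, job_max_memory=100, sshDBTunnel=1)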
def addConvertVCF2EigenStratJob(self, workflow, executable=None, inputF=None, outputFnamePrefix=None,
        missingCallAsRefBase=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=100, **keywords):
    """
    2012.9.11
        add argument missingCallAsRefBase
    2012.3.1
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-i", inputF, "-O", outputFnamePrefix)
    if missingCallAsRefBase:
        job.addArguments("--missingCallAsRefBase")
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(inputF, transfer=True, register=True, link=Link.INPUT)
    genoOutputF = File("%s.geno" % (outputFnamePrefix))
    locusOutputF = File("%s.snp" % (outputFnamePrefix))
    indOutputF = File("%s.ind" % (outputFnamePrefix))
    outputFLs = [genoOutputF, locusOutputF, indOutputF]
    for outputF in outputFLs:
        job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.genoOutputF = genoOutputF
    job.indOutputF = indOutputF
    job.locusOutputF = locusOutputF
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    if parentJobLs:
        for parentJob in parentJobLs:
            if parentJob:
                workflow.depends(parent=parentJob, child=job)
    if extraDependentInputLs:
        for input in extraDependentInputLs:
            if input:
                job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
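# Usage sketch for addConvertVCF2EigenStratJob(): one VCF goes in and the three EigenStrat
# files (.geno/.snp/.ind) come back attached to the job as genoOutputF/locusOutputF/indOutputF,
# which is what a downstream smartpca job depends on (see addSmartpcaJob above). vcfFile and
# eigenstratDirJob are hypothetical placeholders.
#
#   eigenstratJob = self.addConvertVCF2EigenStratJob(workflow,
#       executable=workflow.ConvertVCF2EigenStrat,
#       inputF=vcfFile, outputFnamePrefix=os.path.join('eigenstrat', 'chr1'),
#       missingCallAsRefBase=True, parentJobLs=[eigenstratDirJob], transferOutput=False)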
def addPlotPeakOverlapJob(self, workflow, executable=None, outputFnamePrefix=None,
        parentJobLs=[], job_max_memory=100, walltime=60,
        extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.2.22
        walltime is in minutes (max time allowed on hoffman2 is 24 hours).
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-o", outputFnamePrefix)
    outputFnameSuffixLs = ['_hist_of_fraction_of_association1_peaks_in_result2.png',
        '_hist_of_fraction_of_association2_peaks_in_result1.png',
        '_hist_of_fraction_of_recurrent_peaks_in_combined.png',
        '_no_of_peaks_result1_vs_result2.png',
        '_result1_no_of_peak_vs_fraction.png',
        '_result2_no_of_peak_vs_fraction.png',
        '_1_fraction_in2_vs_2_fraction_in1.png',
        '_combined_no_of_peak_vs_fraction.png']
    for outputFnameSuffix in outputFnameSuffixLs:
        job.uses(File("%s%s" % (outputFnamePrefix, outputFnameSuffix)),
            transfer=True, register=True, link=Link.OUTPUT)
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory, walltime=walltime)
    return job
def addOutputMultiGWASOverlapPeakSpanJob(self, workflow, executable=None,
        outputF=None, peakPadding=None, list_type_id_list=None, result_id_peak_type_id_ls=None,
        genePadding=None, tax_id=3702, genomeRBDictPickleFile=None,
        parentJobLs=[], job_max_memory=100, extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.3.22
        argument list_type_id_list is a comma-separated string, e.g. "129,137".
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd,
        "--genome_drivername=%s" % self.genome_drivername, "--genome_hostname=%s" % self.genome_hostname,
        "--genome_dbname=%s" % self.genome_dbname,
        "--genome_db_user=%s" % self.genome_db_user, "--genome_schema=%s" % self.genome_schema,
        "--genome_db_passwd=%s" % self.genome_db_passwd,
        "--peakPadding=%s" % (peakPadding),
        "--result_id_peak_type_id_ls=%s" % (result_id_peak_type_id_ls),
        "--genePadding=%s" % (genePadding),
        "--tax_id=%s" % (tax_id), "-o", outputF)
    if list_type_id_list:
        job.addArguments("--list_type_id_list=%s" % (list_type_id_list))
    if genomeRBDictPickleFile:
        job.addArguments("-m", genomeRBDictPickleFile)
        job.uses(genomeRBDictPickleFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputF
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    return job
def addFindMaxLDBetweenPeakAndEachLocusJob(self, workflow, executable=None, correlationFile=None, peakLociH5File=None,
        outputFile=None, row_start=None, row_stop=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=100, **keywords):
    """
    2012.3.10
        FindMaxLDBetweenPeakAndEachLocus -j /tmp/peak_59444.h5 -s 0 -t 1000
            -i /Network/Data/250k/tmp-yh/pegasus/LD_between_call_32_and_80.2012.3.9T2005/LD/cor_i1_0_4999_i2_0_4999.h5
            -o /tmp/output.3.h5
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-i", correlationFile, "-j", peakLociH5File, "-o", outputFile)
    if row_start is not None:
        job.addArguments("-s %s" % (row_start))
    if row_stop is not None:
        job.addArguments("-t %s" % (row_stop))
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(peakLociH5File, transfer=False, register=True, link=Link.INPUT)
    job.uses(correlationFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
def addSplitFastaFileJob(self, executable=None, inputFile=None, outputFnamePrefix=None,
        noOfSequencesPerSplitFile=1000, filenameSuffix="", noOfTotalSequences=1000000,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=False,
        extraArguments=None, job_max_memory=500, **keywords):
    """
    2012.5.24
    """
    job = Job(namespace=self.namespace, name=executable.name, version=self.version)
    noOfSplitFiles = int(math.ceil(noOfTotalSequences / float(noOfSequencesPerSplitFile)))
    suffixLength = len(repr(noOfSplitFiles))
    job.addArguments("-i", inputFile, "--noOfSequences %s" % (noOfSequencesPerSplitFile),
        "--outputFnamePrefix", outputFnamePrefix, '--filenameSuffix %s' % (filenameSuffix),
        '--suffixLength %s' % (suffixLength))
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
    job.outputList = []
    for i in xrange(noOfSplitFiles):  # start from 0
        splitFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength,
            fileOrder=i, filenameSuffix=filenameSuffix)
        splitFile = File(splitFname)
        job.outputList.append(splitFile)
        job.uses(splitFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    self.addJob(job)
    for parentJob in parentJobLs:
        if parentJob:
            self.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        if input:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
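# Usage sketch for addSplitFastaFileJob(): the split files are pre-declared from an
# upper-bound estimate (noOfTotalSequences), and downstream per-chunk jobs are fanned out
# over job.outputList. self.SplitFastaFile, refFastaFile and addPerChunkJob() are
# hypothetical placeholders.
#
#   splitDirJob = self.addMkDirJob(outputDir='refSplit')
#   splitJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile, inputFile=refFastaFile,
#       outputFnamePrefix=os.path.join('refSplit', 'ref'),
#       noOfSequencesPerSplitFile=1000, noOfTotalSequences=24000,
#       parentJobLs=[splitDirJob], transferOutput=False)
#   for splitFile in splitJob.outputList:
#       self.addPerChunkJob(inputFile=splitFile, parentJobLs=[splitJob])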
def addCountFastqReadBaseCountJob(self, workflow, executable=None, inputFile=None, outputFile=None,
        isq_id=None, isqf_id=None,
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None,
        job_max_memory=100, **keywords):
    """
    2012.3.14
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("--inputFname", inputFile, "--outputFname", outputFile)
    if isq_id:
        job.addArguments("--isq_id %s" % (isq_id))
    if isqf_id:
        job.addArguments("--isqf_id %s" % (isqf_id))
    if extraArguments:
        job.addArguments(extraArguments)
    job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
    job.output = outputFile
    workflow.addJob(job)
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    return job
def addAddRG2BamJobsAsNeeded(self, workflow=None, alignmentDataLs=None, site_handler=None, input_site_handler=None,
        addOrReplaceReadGroupsJava=None, AddOrReplaceReadGroupsJar=None,
        BuildBamIndexFilesJava=None, BuildBamIndexJar=None,
        mv=None,
        data_dir=None, tmpDir="/tmp", **keywords):
    """
    2012.4.5
        fix some bugs here
    2011-9-15
        add a read group only when the alignment doesn't have one according to the db record.
        DBVervet.pokeBamReadGroupPresence() from misc.py helps to fill in db records if it's unclear.
    2011-9-14
        The read-group-adding jobs have a "move" step that overwrites the original bam & bai
        when site_handler and input_site_handler are the same. Alignment files that don't
        need a read group are left untouched; Pegasus will transfer/symlink them.
    """
    sys.stderr.write("Adding add-read-group2BAM jobs for %s alignments if read group is not detected ..." % (
        len(alignmentDataLs)))
    if workflow is None:
        workflow = self
    job_max_memory = 3500  # in MB
    javaMemRequirement = "-Xms128m -Xmx%sm" % job_max_memory
    indexJobMaxMem = 2500
    addRG2BamDir = None
    addRG2BamDirJob = None
    no_of_rg_jobs = 0
    returnData = []
    for alignmentData in alignmentDataLs:
        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        if alignment.read_group_added != 1:
            if addRG2BamDir is None:
                addRG2BamDir = "addRG2Bam"
                addRG2BamDirJob = self.addMkDirJob(outputDir=addRG2BamDir)
            # add RG to this bam
            sequencer = alignment.individual_sequence.sequencer
            #read_group = '%s_%s_%s_%s_vs_%s' % (alignment.id, alignment.ind_seq_id,
            #    alignment.individual_sequence.individual.code, sequencer, alignment.ref_ind_seq_id)
            read_group = alignment.getReadGroup()  # 2011-11-02
            if sequencer == '454':
                platform_id = 'LS454'
            elif sequencer == 'GA':
                platform_id = 'ILLUMINA'
            else:
                platform_id = 'ILLUMINA'
            # the add-read-group job
            #addRGJob = Job(namespace=namespace, name=addRGExecutable.name, version=version)
            addRGJob = Job(namespace=workflow.namespace, name=addOrReplaceReadGroupsJava.name, version=workflow.version)
            outputRGSAM = File(os.path.join(addRG2BamDir, os.path.basename(alignment.path)))
            addRGJob.addArguments(javaMemRequirement, '-jar', AddOrReplaceReadGroupsJar,
                "INPUT=", bamF,
                'RGID=%s' % (read_group), 'RGLB=%s' % (platform_id), 'RGPL=%s' % (platform_id),
                'RGPU=%s' % (read_group), 'RGSM=%s' % (read_group),
                'OUTPUT=', outputRGSAM, 'SORT_ORDER=coordinate', "VALIDATION_STRINGENCY=LENIENT")
                # (adding SORT_ORDER doesn't do any sorting; it just marks the header as sorted
                # so that BuildBamIndexJar won't fail.)
            self.addJobUse(addRGJob, file=AddOrReplaceReadGroupsJar, transfer=True, register=True, link=Link.INPUT)
            if tmpDir:
                addRGJob.addArguments("TMP_DIR=%s" % tmpDir)
            addRGJob.uses(bamF, transfer=True, register=True, link=Link.INPUT)
            addRGJob.uses(baiF, transfer=True, register=True, link=Link.INPUT)
            addRGJob.uses(outputRGSAM, transfer=True, register=True, link=Link.OUTPUT)
            yh_pegasus.setJobProperRequirement(addRGJob, job_max_memory=job_max_memory)
            for parentJob in parentJobLs:
                if parentJob:
                    workflow.depends(parent=parentJob, child=addRGJob)
            workflow.addJob(addRGJob)
            index_sam_job = self.addBAMIndexJob(workflow, BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
                BuildBamIndexJar=workflow.BuildBamIndexJar,
                inputBamF=outputRGSAM, parentJobLs=[addRGJob], transferOutput=True, javaMaxMemory=2000)
            newAlignmentData = PassingData(alignment=alignment)
            newAlignmentData.jobLs = [index_sam_job, addRGJob]
            newAlignmentData.bamF = index_sam_job.bamFile
            newAlignmentData.baiF = index_sam_job.baiFile
            """
            # add the index job to the bamF (needs to be re-indexed)
            index_sam_job = Job(namespace=namespace, name=BuildBamIndexFilesJava.name, version=version)
            if input_site_handler == site_handler:
                # on the same site: overwrite the original file without RG
                mvJob = Job(namespace=namespace, name=mv.name, version=version)
                mvJob.addArguments(outputRGSAM, inputFname)  # watch: it's inputFname, not input. input is a relative path.
                #samToBamJob.uses(outputRG, transfer=False, register=True, link=Link.OUTPUT)  # don't register it here
                workflow.addJob(mvJob)
                workflow.depends(parent=addRGJob, child=mvJob)
                bai_output = File('%s.bai' % inputFname)  # absolute path, don't register it to the job
            else:
                # on a different site: the input for indexing should be outputRGSAM, registered as well
                mvJob = addRGJob
                bamF = outputRGSAM
                addRGJob.uses(outputRGSAM, transfer=True, register=True, link=Link.OUTPUT)
                bai_output = File('%s.bai' % outputRGSAMFname)
            index_sam_job.uses(bai_output, transfer=True, register=False, link=Link.OUTPUT)
            yh_pegasus.setJobProperRequirement(index_sam_job, job_max_memory=indexJobMaxMem)
            workflow.addJob(index_sam_job)
            workflow.depends(parent=mvJob, child=index_sam_job)
            alignmentId2RGJobDataLs[alignment.id] = [index_sam_job, inputFile, bai_output]
            """
            no_of_rg_jobs += 1
        else:
            newAlignmentData = alignmentData
        returnData.append(newAlignmentData)
    sys.stderr.write(" %s alignments need read-group addition. Done\n" % (no_of_rg_jobs))
    return returnData
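# Usage sketch for addAddRG2BamJobsAsNeeded(): it returns one alignmentData per input
# alignment, re-pointed at the read-group-added (and re-indexed) bam/bai where needed, so
# downstream jobs can depend on .jobLs and consume .bamF/.baiF uniformly. alignmentDataLs and
# the workflow.* executable/jar attributes shown below are hypothetical placeholders for the
# usual PassingData(alignment=..., jobLs=..., bamF=..., baiF=...) objects and registered
# executables.
#
#   alignmentDataLs = self.addAddRG2BamJobsAsNeeded(workflow, alignmentDataLs=alignmentDataLs,
#       site_handler=self.site_handler, input_site_handler=self.input_site_handler,
#       addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava,
#       AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar,
#       BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar,
#       mv=workflow.mv, tmpDir="/tmp")
#   for alignmentData in alignmentDataLs:
#       # downstream jobs depend on alignmentData.jobLs and use alignmentData.bamF / alignmentData.baiF
#       pass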
def addDrawSNPRegionJob(self, workflow, executable=None,
        inputF=None, call_method_id=None, snpMatrixFile=None, phenotypeFile=None, output_dir=None,
        results_directory=None, analysis_method_id_ls=None, geneAnnotationPickleFile=None,
        list_type_id_list=None, snp_matrix_data_type=1, exclude_accessions_with_NA_phenotype=0,
        snpInfoPickleFile=None, label_gene=1, min_MAF=0.1, min_distance=20000,
        logFile=None,
        parentJobLs=[], job_max_memory=2000, extraDependentInputLs=[],
        transferOutput=False, **keywords):
    """
    2012.3.22
        arguments analysis_method_id_ls and list_type_id_list are comma-separated strings, e.g. "129,137".
        DrawSNPRegion.py -I /Network/Data/250k/db/dataset/call_method_80.tsv
            -N /Network/Data/250k/tmp-yh/phenotype/phenotype.tsv -l 129
            -o /Network/Data/250k/tmp-yh/snp_region -j /Network/Data/250k/tmp-yh/at_gene_model_pickelf
            -e 80 -u yh -s -a 1,32 -z banyan -u yh
    """
    job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
    job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname,
        "-u", self.db_user, "-p", self.db_passwd,
        "-i", inputF,
        "--call_method_id=%s" % (call_method_id),
        "-I", snpMatrixFile,
        "-N", phenotypeFile, "--output_dir=%s" % (output_dir),
        "--analysis_method_id_ls=%s" % (analysis_method_id_ls),
        "-j", geneAnnotationPickleFile, "--list_type_id_list=%s" % (list_type_id_list),
        "--snp_matrix_data_type=%s" % (snp_matrix_data_type),
        "--min_MAF=%s" % (min_MAF), "--min_distance=%s" % (min_distance))
    job.uses(inputF, transfer=True, register=True, link=Link.INPUT)
    job.uses(snpMatrixFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(phenotypeFile, transfer=True, register=True, link=Link.INPUT)
    job.uses(geneAnnotationPickleFile, transfer=True, register=True, link=Link.INPUT)
    if exclude_accessions_with_NA_phenotype:
        job.addArguments("--exclude_accessions_with_NA_phenotype")
    if label_gene:
        job.addArguments("--label_gene")
    if results_directory:
        job.addArguments("--results_directory=%s" % (results_directory))
    if snpInfoPickleFile:
        job.addArguments("-F", snpInfoPickleFile)
        job.uses(snpInfoPickleFile, transfer=True, register=True, link=Link.INPUT)
    if logFile:
        job.addArguments("-A", logFile)
        job.uses(logFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
        job.output = logFile
    yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
    workflow.addJob(job)
    for input in extraDependentInputLs:
        job.uses(input, transfer=True, register=True, link=Link.INPUT)
    for parentJob in parentJobLs:
        workflow.depends(parent=parentJob, child=job)
    return job
def run(self):
    """
    2011-9-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    workflow = self.initiateWorkflow()

    self.registerJars()
    self.registerExecutables()
    self.registerCustomExecutables(workflow)

    site_handler = self.site_handler
    input_site_handler = self.input_site_handler

    ref_seq_f = self.registerOneInputFile(workflow, self.ref_seq_fname, folderName=self.pegasusFolderName)
    query_seq_f = self.registerOneInputFile(workflow, self.query_seq_fname, folderName=self.pegasusFolderName)

    # Add the mkdir jobs
    deltaOutputDir = "delta"
    deltaOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=deltaOutputDir)
    coordsOutputDir = "coords"
    coordsOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=coordsOutputDir)
    filterOutputDir = "filter"
    filterOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=filterOutputDir)
    plotOutputDir = "plot"
    plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotOutputDir)
    #plotScriptOutputDir = "plotScript"
    #plotScriptOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotScriptOutputDir)

    refNameLs = self.getFastaRecordTitleLs(self.ref_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, ref_seq_f, self.SelectAndSplitFastaRecords, refNameLs,
        mkdirWrap=self.mkdirWrap, site_handler=site_handler, namespace=self.namespace, version=self.version,
        fastaOutputDir='refFasta')
    refName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    queryNameLs = self.getFastaRecordTitleLs(self.query_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, query_seq_f, self.SelectAndSplitFastaRecords, queryNameLs,
        mkdirWrap=self.mkdirWrap, site_handler=site_handler, namespace=self.namespace, version=self.version,
        fastaOutputDir='queryFasta')
    queryName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    noOfJobs = len(refName2splitFastaJobDataLs) + len(queryName2splitFastaJobDataLs)

    ref_seq_prefix = os.path.splitext(os.path.basename(ref_seq_f.name))[0]
    for queryName, jobDataLs in queryName2splitFastaJobDataLs.iteritems():
        for refName, refJobDataLs in refName2splitFastaJobDataLs.iteritems():
            refSelectAndSplitFastaJob, refFastaFile = refJobDataLs[:2]
            selectAndSplitFastaJob, fastaFile = jobDataLs[:2]
            nucmerJob = Job(namespace=self.namespace, name=self.nucmer.name, version=self.version)
            outputPrefix = "%s_vs_%s_%s" % (queryName, ref_seq_prefix, refName)
            deltaFnamePrefix = os.path.join(deltaOutputDir, outputPrefix)
            nucmerJob.addArguments("--maxgap=500", "--mincluster=100", "--prefix", deltaFnamePrefix,
                refFastaFile, fastaFile)
            nucmerJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            nucmerJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            deltaFname = "%s.delta" % (deltaFnamePrefix)
            deltaF = File(deltaFname)
            nucmerJob.uses(deltaF, transfer=True, register=True, link=Link.OUTPUT)
            # 3000M is enough for one nucmer job with human as ref
            job_max_memory = 5000  # in MB
            yh_pegasus.setJobProperRequirement(nucmerJob, job_max_memory=job_max_memory)
            workflow.addJob(nucmerJob)
            workflow.depends(parent=refSelectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=selectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=deltaOutputDirJob, child=nucmerJob)

            coordsFname = os.path.join(coordsOutputDir, "%s.coords" % (outputPrefix))
            coordsF = File(coordsFname)
            filterFname = os.path.join(filterOutputDir, "%s.filter" % (outputPrefix))
            filterF = File(filterFname)
            plotPrefix = os.path.join(plotOutputDir, "%s_plot" % (outputPrefix))
            png_plotF = File("%s.png" % plotPrefix)
            gp_plotF = File("%s.gp" % plotPrefix)
            fplot_plotF = File("%s.fplot" % plotPrefix)
            rplot_plotF = File("%s.rplot" % plotPrefix)
            postNucJob = Job(namespace=self.namespace, name=self.PostNucmer.name, version=self.version)
            postNucJob.addArguments(deltaF, coordsF, filterF, refFastaFile, fastaFile, plotPrefix)
            postNucJob.uses(deltaF, transfer=True, register=True, link=Link.INPUT)
            postNucJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(coordsF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(filterF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(png_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # leave the files below behind (not transferred)
            #postNucJob.uses(gp_plotF, transfer=True, register=True, link=Link.OUTPUT)
            #postNucJob.uses(fplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            #postNucJob.uses(rplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            yh_pegasus.setJobProperRequirement(postNucJob, job_max_memory=2000)
            workflow.addJob(postNucJob)
            workflow.depends(parent=nucmerJob, child=postNucJob)
            workflow.depends(parent=coordsOutputDirJob, child=postNucJob)
            workflow.depends(parent=filterOutputDirJob, child=postNucJob)
            workflow.depends(parent=plotOutputDirJob, child=postNucJob)
            #workflow.depends(parent=plotScriptOutputDirJob, child=postNucJob)
            noOfJobs += 2
    sys.stderr.write(" %s jobs. \n" % (noOfJobs))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)