Example #1
    def addPickleGenomeRBDictJob(self, workflow, executable=None, \
          outputF=None, genePadding=None, tax_id=3702,\
          parentJobLs=[], job_max_memory=100, extraDependentInputLs=[], \
          transferOutput=False, **keywords):
        """
		2012.3.22
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)

        job.addArguments("-v", self.genome_drivername, "-z", self.genome_hostname, "-d", self.genome_dbname, \
            "-u", self.genome_db_user, "-p", self.genome_db_passwd,\
            "--genePadding=%s"%(genePadding), "--tax_id=%s"%(tax_id), "-o", outputF)
        if self.genome_schema:
            job.addArguments("--schema=%s" % self.genome_schema)
        job.uses(outputF,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputF
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        workflow.addJob(job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        return job
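Every helper in this collection follows the same Pegasus DAX recipe: construct a Job around an executable, declare its files with uses(), attach resource requirements, register the job with the workflow, and wire parent dependencies before returning the job for downstream chaining. A stripped-down sketch of that shared skeleton (a sketch only; the helper name and the "-o" flag are illustrative, not from the source):

    def addGenericJob(self, workflow, executable=None, outputF=None,
            parentJobLs=None, extraDependentInputLs=None,
            transferOutput=False, job_max_memory=100, **keywords):
        job = Job(namespace=workflow.namespace, name=executable.name,
                  version=workflow.version)
        job.addArguments("-o", outputF)  # executable-specific flags go here
        job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
        job.output = outputF  # convenience handle for downstream wiring
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        workflow.addJob(job)
        for inputF in (extraDependentInputLs or []):
            job.uses(inputF, transfer=True, register=True, link=Link.INPUT)
        for parentJob in (parentJobLs or []):
            workflow.depends(parent=parentJob, child=job)
        return job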
Example #2
    def addOutputLociIDOfResultPeakInHDF5Job(self, workflow, executable=None, peak_id=None, outputFile=None,\
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
        job_max_memory=10, **keywords):
        """
		2012.3.10
			-i 59444 -u yh -z banyan -o /tmp/peak_59444.h5
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd,\
            "-i", repr(int(peak_id)), "-o", outputFile)

        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(outputFile,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
Example #3
    def addSmartpcaJob(
        self,
        workflow,
        executable=None,
        smartpcaParameterFile=None,
        parentJobLs=[],
        extraDependentInputLs=[],
        transferOutput=True,
        extraArguments=None,
        outputFileList=[],
        job_max_memory=100,
        **keywords
    ):
        """
		2012.3.1
		"""
        job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
        job.addArguments("-p", smartpcaParameterFile)
        job.uses(smartpcaParameterFile, transfer=True, register=True, link=Link.INPUT)
        if extraArguments:
            job.addArguments(extraArguments)
        for outputF in outputFileList:
            if outputF:
                job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)

        return job
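smartpca names its output files inside the -p parameter file rather than on the command line, which is why the caller must mirror those names in outputFileList for Pegasus to register and stage them. A hedged usage sketch (the parameter-file job, the executable handle, and the file names are assumptions):

    evecFile = File("smartpca.evec")
    evalFile = File("smartpca.eval")
    smartpcaJob = self.addSmartpcaJob(workflow, executable=self.smartpca,
        smartpcaParameterFile=parameterFile,
        parentJobLs=[parameterFileJob],
        outputFileList=[evecFile, evalFile],
        transferOutput=True, job_max_memory=2000)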
Example #4
    def addConvertSNPData2HDF5Job(self, workflow, executable=None, inputFile=None, outputFile=None, min_MAF=None, \
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
        job_max_memory=100, **keywords):
        """
		2012.3.2
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments('-i', inputFile, '-o', outputFile,
                         '-n %s' % (min_MAF))
        job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(outputFile,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
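One caveat that applies to every signature in these examples: parentJobLs=[] and extraDependentInputLs=[] are mutable default arguments, which Python evaluates once at definition time, so any caller that appends to the default list silently leaks entries into every later call. A safer variant of this signature, as a sketch:

    def addConvertSNPData2HDF5Job(self, workflow, executable=None, inputFile=None,
            outputFile=None, min_MAF=None, parentJobLs=None,
            extraDependentInputLs=None, transferOutput=True,
            extraArguments=None, job_max_memory=100, **keywords):
        # None sentinels avoid sharing one list object across all calls
        if parentJobLs is None:
            parentJobLs = []
        if extraDependentInputLs is None:
            extraDependentInputLs = []
        # ... body unchanged ...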
Example #5
    def addDrawManhattanPlotForLDInHDF5Job(self, workflow, executable=None, correlationFile=None, peak_id=None, \
             datasetName=None, outputFile=None,\
        outputFnamePrefix=None, parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
        job_max_memory=10, **keywords):
        """
		2012.3.10
			DrawManhattanPlotForLDInHDF5.py -w secret -i /tmp/output.2.h5 -l 59444 -N correlation
				-O /tmp/gw_LD_pattern_between_snp_and_peak_59444 -u yh -p secret
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd, "-w", self.genome_db_passwd,\
            "-l %s"%(peak_id),"-N", datasetName, "-i", correlationFile, "-O", outputFnamePrefix)
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(correlationFile,
                 transfer=True,
                 register=True,
                 link=Link.INPUT)
        job.uses(outputFile,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
Example #6
    def addCalculateColCorBetweenTwoHDF5Job(self, workflow, executable=None, inputFile1=None, inputFile2=None, outputFile=None, \
        i1_start=None, i1_stop=None, i2_start=None, i2_stop=None, min_cor=None, \
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
        job_max_memory=100, **keywords):
        """
		2012.3.2
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments('-o', outputFile,'-i', inputFile1, '-j', inputFile2, '-s %s'%i1_start, '-t %s'%i1_stop, \
            '-u %s'%i2_start, '-v %s'%i2_stop, '-c %s'%min_cor)
        job.uses(inputFile1, transfer=True, register=True, link=Link.INPUT)
        job.uses(inputFile2, transfer=True, register=True, link=Link.INPUT)
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(outputFile,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)

        return job
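The i1/i2 start/stop arguments exist so a genome-scale correlation matrix can be tiled into independent block jobs; the file name cor_i1_0_4999_i2_0_4999.h5 in Example #13's docstring suggests 5000-column blocks. A hedged sketch of how such a tiling loop might generate the jobs (the block size, locus counts, upstream jobs, and executable handle are assumptions):

    blockSize = 5000
    corJobLs = []
    for i1_start in range(0, noOfLoci1, blockSize):
        i1_stop = min(i1_start + blockSize, noOfLoci1) - 1
        for i2_start in range(0, noOfLoci2, blockSize):
            i2_stop = min(i2_start + blockSize, noOfLoci2) - 1
            corFile = File("cor_i1_%s_%s_i2_%s_%s.h5" % (i1_start, i1_stop, i2_start, i2_stop))
            corJob = self.addCalculateColCorBetweenTwoHDF5Job(workflow,
                executable=self.CalculateColCorBetweenTwoHDF5,
                inputFile1=snpHDF5File1, inputFile2=snpHDF5File2, outputFile=corFile,
                i1_start=i1_start, i1_stop=i1_stop, i2_start=i2_start, i2_stop=i2_stop,
                min_cor=0.2, parentJobLs=[convertJob1, convertJob2],
                transferOutput=False)
            corJobLs.append(corJob)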
Example #7
    def addPickleSNPInfoJob(self, workflow, executable=None, \
         outputF=None, call_method_id=None, \
         parentJobLs=[], job_max_memory=100, extraDependentInputLs=[], \
         transferOutput=False, **keywords):
        """
		2012.3.22
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd, \
            "--call_method_id=%s"%(call_method_id), "-F", outputF)
        job.uses(outputF,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputF
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        workflow.addJob(job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        return job
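Example #8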
    def addGWASPeakOverlapJob(self, workflow, executable=None, \
          result1_id=None, result2_id=None, association1_peak_type_id=None, \
          association2_peak_type_id=None, peak_padding=None, outputF=None, \
          commit=0, results_directory=None, logFile=None, \
          parentJobLs=[], job_max_memory=100, walltime = 60, \
          extraDependentInputLs=[], \
          transferOutput=False, **keywords):
        """
		2012.2.22
			walltime is in minutes (max time allowed on hoffman2 is 24 hours).
			
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        #apply int() first: result1_id may be a Python long, and repr() of a long yields e.g. "434L", which downstream integer parsing would reject
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd,\
            "-i", repr(int(result1_id)), "-j", repr(int(result2_id)), \
            "-x", repr(association1_peak_type_id), "-y", repr(association2_peak_type_id), \
            "-e", repr(peak_padding), "-o", outputF)
        job.uses(outputF,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        if commit:
            job.addArguments("-c")
        if results_directory:
            job.addArguments(
                "-t",
                results_directory,
            )
        if self.schema:
            job.addArguments(
                "-k",
                self.schema,
            )
        if logFile:
            job.addArguments("--logFilename=%s" % (logFile.name))
            job.uses(logFile,
                     transfer=transferOutput,
                     register=transferOutput,
                     link=Link.OUTPUT)
            job.output = logFile
        workflow.addJob(job)

        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        yh_pegasus.setJobProperRequirement(job,
                                           job_max_memory=job_max_memory,
                                           walltime=walltime)
        return job
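Example #9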
    def addPutReadBaseCountIntoDBJob(
        self,
        workflow,
        executable=None,
        inputFileLs=[],
        logFile=None,
        commit=False,
        parentJobLs=[],
        extraDependentInputLs=[],
        transferOutput=True,
        extraArguments=None,
        job_max_memory=10,
        sshDBTunnel=1,
        **keywords
    ):
        """
		2012.5.3
			add argument sshDBTunnel
		2012.3.14
		"""
        job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
        job.addArguments(
            "--drivername",
            self.drivername,
            "--hostname",
            self.hostname,
            "--dbname",
            self.dbname,
            "--db_user",
            self.db_user,
            "--db_passwd",
            self.db_passwd,
            "--logFilename",
            logFile,
        )
        if extraArguments:
            job.addArguments(extraArguments)
        if commit:
            job.addArguments("--commit")
        for inputFile in inputFileLs:
            job.addArguments(inputFile)
            job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
        job.uses(logFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
        job.output = logFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory, sshDBTunnel=sshDBTunnel)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
Example #10
    def addConvertVCF2EigenStratJob(
        self,
        workflow,
        executable=None,
        inputF=None,
        outputFnamePrefix=None,
        missingCallAsRefBase=None,
        parentJobLs=[],
        extraDependentInputLs=[],
        transferOutput=True,
        extraArguments=None,
        job_max_memory=100,
        **keywords
    ):
        """
		2012.9.11 add argument missingCallAsRefBase
		2012.3.1
		"""
        job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
        job.addArguments("-i", inputF, "-O", outputFnamePrefix)
        if missingCallAsRefBase:
            job.addArguments("--missingCallAsRefBase")
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(inputF, transfer=True, register=True, link=Link.INPUT)
        genoOutputF = File("%s.geno" % (outputFnamePrefix))
        locusOutputF = File("%s.snp" % (outputFnamePrefix))
        indOutputF = File("%s.ind" % (outputFnamePrefix))
        outputFLs = [genoOutputF, locusOutputF, indOutputF]
        for outputF in outputFLs:
            job.uses(outputF, transfer=transferOutput, register=True, link=Link.OUTPUT)
        job.genoOutputF = genoOutputF
        job.indOutputF = indOutputF
        job.locusOutputF = locusOutputF
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        if parentJobLs:
            for parentJob in parentJobLs:
                if parentJob:
                    workflow.depends(parent=parentJob, child=job)
        if extraDependentInputLs:
            for input in extraDependentInputLs:
                if input:
                    job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
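The geno/snp/ind File handles attached to the returned job let downstream jobs consume the EigenStrat trio without re-deriving the names; the natural consumer is addSmartpcaJob from Example #3. A hedged chaining sketch (the VCF jobs, parameter-file wiring, and output files are assumptions):

    convertJob = self.addConvertVCF2EigenStratJob(workflow,
        executable=self.ConvertVCF2EigenStrat, inputF=vcfFile,
        outputFnamePrefix="call80", missingCallAsRefBase=True,
        parentJobLs=[vcfJob], transferOutput=False)
    smartpcaJob = self.addSmartpcaJob(workflow, executable=self.smartpca,
        smartpcaParameterFile=parameterFile,
        parentJobLs=[parameterFileJob, convertJob],
        extraDependentInputLs=[convertJob.genoOutputF,
            convertJob.locusOutputF, convertJob.indOutputF],
        outputFileList=[evecFile, evalFile])

Example #11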
    def addPlotPeakOverlapJob(self, workflow, executable=None, \
          outputFnamePrefix=None, \
          parentJobLs=[], job_max_memory=100, walltime = 60, \
          extraDependentInputLs=[], \
          transferOutput=False, **keywords):
        """
		2012.2.22
			walltime is in minutes (max time allowed on hoffman2 is 24 hours).
			
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)

        job.addArguments("-o", outputFnamePrefix)

        job.uses(File("%s_hist_of_fraction_of_association1_peaks_in_result2.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_hist_of_fraction_of_association2_peaks_in_result1.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_no_of_peaks_result1_vs_result2.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_result1_no_of_peak_vs_fraction.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_result2_no_of_peak_vs_fraction.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_1_fraction_in2_vs_2_fraction_in1.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)
        job.uses(File("%s_combined_no_of_peak_vs_fraction.png"%outputFnamePrefix), \
             transfer=True, register=True, link=Link.OUTPUT)

        workflow.addJob(job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        yh_pegasus.setJobProperRequirement(job,
                                           job_max_memory=job_max_memory,
                                           walltime=walltime)
        return job
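addPlotPeakOverlapJob only receives -o with a prefix, so the overlap statistics it plots presumably arrive as extra dependent inputs from addGWASPeakOverlapJob (Example #8); that wiring is an assumption here, as is every identifier below:

    overlapFile = File("overlap_4634_vs_4635.tsv")
    overlapJob = self.addGWASPeakOverlapJob(workflow, executable=self.GWASPeakOverlap,
        result1_id=4634, result2_id=4635,
        association1_peak_type_id=1, association2_peak_type_id=1,
        peak_padding=10000, outputF=overlapFile, commit=1,
        transferOutput=False)
    plotJob = self.addPlotPeakOverlapJob(workflow, executable=self.PlotPeakOverlap,
        outputFnamePrefix="overlap_4634_vs_4635",
        parentJobLs=[overlapJob], extraDependentInputLs=[overlapFile],
        transferOutput=True)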
Example #12
    def addOutputMultiGWASOverlapPeakSpanJob(self, workflow, executable=None, \
         outputF=None, peakPadding=None,list_type_id_list=None, result_id_peak_type_id_ls=None,\
         genePadding=None, tax_id=3702, genomeRBDictPickleFile=None, \
         parentJobLs=[], job_max_memory=100, extraDependentInputLs=[], \
         transferOutput=False, **keywords):
        """
		2012.3.22
			argument list_type_id_list is a comma-separated string, e.g. "129,137"
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd, \
            "--genome_drivername=%s"%self.genome_drivername, "--genome_hostname=%s"%self.genome_hostname, \
            "--genome_dbname=%s"%self.genome_dbname, \
            "--genome_db_user=%s"%self.genome_db_user, "--genome_schema=%s"%self.genome_schema, \
            "--genome_db_passwd=%s"%self.genome_db_passwd,\
            "--peakPadding=%s"%(peakPadding), \
            "--result_id_peak_type_id_ls=%s"%(result_id_peak_type_id_ls), \
            "--genePadding=%s"%(genePadding), \
            "--tax_id=%s"%(tax_id), "-o", outputF)
        if list_type_id_list:
            job.addArguments("--list_type_id_list=%s" % (list_type_id_list))
        if genomeRBDictPickleFile:
            job.addArguments("-m", genomeRBDictPickleFile)
            job.uses(genomeRBDictPickleFile,
                     transfer=True,
                     register=True,
                     link=Link.INPUT)
        job.uses(outputF,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputF
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        workflow.addJob(job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        return job
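This job accepts the genome RBDict pickle produced by addPickleGenomeRBDictJob in Example #1 through -m, which is the one chaining these two examples support directly. A sketch (the executable handles and the result_id_peak_type_id_ls format are assumptions):

    genomeRBDictFile = File("genomeRBDict.tax3702.pickle")
    pickleJob = self.addPickleGenomeRBDictJob(workflow,
        executable=self.PickleGenomeRBDict, outputF=genomeRBDictFile,
        genePadding=20000, tax_id=3702)
    spanJob = self.addOutputMultiGWASOverlapPeakSpanJob(workflow,
        executable=self.OutputMultiGWASOverlapPeakSpan,
        outputF=File("overlapPeakSpan.tsv"), peakPadding=10000,
        list_type_id_list="129,137",
        result_id_peak_type_id_ls="4634:1,4635:1",
        genePadding=20000, tax_id=3702,
        genomeRBDictPickleFile=genomeRBDictFile,
        parentJobLs=[pickleJob], transferOutput=True)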
Example #13
    def addFindMaxLDBetweenPeakAndEachLocusJob(self, workflow, executable=None, correlationFile=None, peakLociH5File=None, \
              outputFile=None, row_start=None, row_stop=None, \
        parentJobLs=[], extraDependentInputLs=[], transferOutput=True, extraArguments=None, \
        job_max_memory=100, **keywords):
        """
		2012.3.10
			FindMaxLDBetweenPeakAndEachLocus -j /tmp/peak_59444.h5
				-s 0 -t 1000
				-i /Network/Data/250k/tmp-yh/pegasus/LD_between_call_32_and_80.2012.3.9T2005/LD/cor_i1_0_4999_i2_0_4999.h5
				-o /tmp/output.3.h5
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-i", correlationFile, "-j", peakLociH5File, "-o",
                         outputFile)
        if row_start is not None:
            job.addArguments("-s %s" % (row_start))
        if row_stop is not None:
            job.addArguments("-t %s" % (row_stop))
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(peakLociH5File,
                 transfer=False,
                 register=True,
                 link=Link.INPUT)
        job.uses(correlationFile,
                 transfer=True,
                 register=True,
                 link=Link.INPUT)
        job.uses(outputFile,
                 transfer=transferOutput,
                 register=True,
                 link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
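The docstrings tie three of these helpers into one pipeline: Example #2 extracts a peak's loci into peak_<id>.h5, this job scans a correlation block for the maximum LD between the peak and each locus, and Example #5 draws the genome-wide pattern. A chaining sketch (corJob stands for one upstream correlation job; all names are illustrative):

    peakFile = File("peak_59444.h5")
    peakJob = self.addOutputLociIDOfResultPeakInHDF5Job(workflow,
        executable=self.OutputLociIDOfResultPeakInHDF5,
        peak_id=59444, outputFile=peakFile, transferOutput=False)
    maxLDFile = File("max_LD_vs_peak_59444.h5")
    maxLDJob = self.addFindMaxLDBetweenPeakAndEachLocusJob(workflow,
        executable=self.FindMaxLDBetweenPeakAndEachLocus,
        correlationFile=corJob.output, peakLociH5File=peakFile,
        outputFile=maxLDFile, row_start=0, row_stop=1000,
        parentJobLs=[corJob, peakJob], transferOutput=False)
    self.addDrawManhattanPlotForLDInHDF5Job(workflow,
        executable=self.DrawManhattanPlotForLDInHDF5,
        correlationFile=maxLDFile, peak_id=59444, datasetName="correlation",
        outputFile=File("gw_LD_pattern_peak_59444.png"),
        outputFnamePrefix="gw_LD_pattern_peak_59444",
        parentJobLs=[maxLDJob], transferOutput=True)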
Example #14
    def addSplitFastaFileJob(self, executable=None, inputFile=None, outputFnamePrefix=None, \
         noOfSequencesPerSplitFile=1000, filenameSuffix="", noOfTotalSequences=1000000,\
         parentJobLs=[], extraDependentInputLs=[], transferOutput=False, \
         extraArguments=None, job_max_memory=500, **keywords):
        """
		2012.5.24
		"""
        job = Job(namespace=self.namespace,
                  name=executable.name,
                  version=self.version)

        noOfSplitFiles = int(
            math.ceil(noOfTotalSequences / float(noOfSequencesPerSplitFile)))
        suffixLength = len(repr(noOfSplitFiles))

        job.addArguments("-i", inputFile, "--noOfSequences %s"%(noOfSequencesPerSplitFile), \
            "--outputFnamePrefix", outputFnamePrefix, '--filenameSuffix %s'%(filenameSuffix), '--suffixLength %s'%(suffixLength))
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
        job.outputList = []
        for i in xrange(noOfSplitFiles):  #start from 0
            splitFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength, fileOrder=i,\
                    filenameSuffix=filenameSuffix)
            splitFile = File(splitFname)

            job.outputList.append(splitFile)
            job.uses(splitFile,
                     transfer=transferOutput,
                     register=True,
                     link=Link.OUTPUT)

        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        self.addJob(job)
        for parentJob in parentJobLs:
            if parentJob:
                self.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            if input:
                job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
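The per-chunk File handles collected on job.outputList are what make fan-out possible: one downstream job per split file, each depending only on the split job. A hedged sketch (the input registration job and the per-chunk alignment helper are hypothetical):

    splitJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile,
        inputFile=fastaFile, outputFnamePrefix="querySplit",
        noOfSequencesPerSplitFile=1000,
        noOfTotalSequences=noOfTotalSequences,
        parentJobLs=[fastaRegisterJob])
    for splitFile in splitJob.outputList:
        # hypothetical per-chunk consumer
        self.addAlignmentJob(inputFile=splitFile, parentJobLs=[splitJob])

Example #15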
    def addCountFastqReadBaseCountJob(
        self,
        workflow,
        executable=None,
        inputFile=None,
        outputFile=None,
        isq_id=None,
        isqf_id=None,
        parentJobLs=[],
        extraDependentInputLs=[],
        transferOutput=True,
        extraArguments=None,
        job_max_memory=100,
        **keywords
    ):
        """
		2012.3.14
		"""
        job = Job(namespace=workflow.namespace, name=executable.name, version=workflow.version)
        job.addArguments("--inputFname", inputFile, "--outputFname", outputFile)
        if isq_id:
            job.addArguments("--isq_id %s" % (isq_id))
        if isqf_id:
            job.addArguments("--isqf_id %s" % (isqf_id))
        if extraArguments:
            job.addArguments(extraArguments)
        job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
        job.uses(outputFile, transfer=transferOutput, register=True, link=Link.OUTPUT)
        job.output = outputFile
        workflow.addJob(job)
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        return job
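A plausible pairing, though the source doesn't show it: many addCountFastqReadBaseCountJob jobs fanning into one addPutReadBaseCountIntoDBJob via inputFileLs, with sshDBTunnel=1 so the database insert runs through a tunnel (fastqFileLs, isq_id, and the executable handles are assumptions):

    countJobLs = []
    for i, fastqFile in enumerate(fastqFileLs):
        countFile = File("read_count_%s.tsv" % (i))
        countJob = self.addCountFastqReadBaseCountJob(workflow,
            executable=self.CountFastqReadBaseCount,
            inputFile=fastqFile, outputFile=countFile,
            isq_id=isq_id, transferOutput=False)
        countJobLs.append(countJob)
    self.addPutReadBaseCountIntoDBJob(workflow,
        executable=self.PutReadBaseCountIntoDB,
        inputFileLs=[job.output for job in countJobLs],
        logFile=File("putReadBaseCount.log"), commit=True,
        parentJobLs=countJobLs, sshDBTunnel=1, transferOutput=True)

Example #16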
    def addAddRG2BamJobsAsNeeded(self, workflow=None, alignmentDataLs=None, site_handler=None, input_site_handler=None, \
          addOrReplaceReadGroupsJava=None, AddOrReplaceReadGroupsJar=None, \
          BuildBamIndexFilesJava=None, BuildBamIndexJar=None, \
          mv=None, \
          data_dir=None, tmpDir="/tmp", **keywords):
        """
		2012.4.5
			fix some bugs here
		2011-9-15
			add a read group only when the alignment doesn't have one according to its db record;
			DBVervet.pokeBamReadGroupPresence() from misc.py helps fill in db records when it's unclear.
		2011-9-14
			The read-group-adding jobs have a "move" part that overwrites the original bam & bai if site_handler and input_site_handler are the same.
			For alignment files that don't need read-group addition it doesn't matter; Pegasus will transfer/symlink them.
		"""
        sys.stderr.write(
            "Adding add-read-group2BAM jobs for %s alignments if read group is not detected ..."
            % (len(alignmentDataLs)))
        if workflow is None:
            workflow = self
        job_max_memory = 3500  #in MB
        javaMemRequirement = "-Xms128m -Xmx%sm" % job_max_memory
        indexJobMaxMem = 2500

        addRG2BamDir = None
        addRG2BamDirJob = None

        no_of_rg_jobs = 0
        returnData = []
        for alignmentData in alignmentDataLs:
            alignment = alignmentData.alignment
            parentJobLs = alignmentData.jobLs
            bamF = alignmentData.bamF
            baiF = alignmentData.baiF
            if alignment.read_group_added != 1:
                if addRG2BamDir is None:
                    addRG2BamDir = "addRG2Bam"
                    addRG2BamDirJob = self.addMkDirJob(outputDir=addRG2BamDir)

                # add RG to this bam
                sequencer = alignment.individual_sequence.sequencer
                #read_group = '%s_%s_%s_%s_vs_%s'%(alignment.id, alignment.ind_seq_id, alignment.individual_sequence.individual.code, \
                #						sequencer, alignment.ref_ind_seq_id)
                read_group = alignment.getReadGroup()  ##2011-11-02
                if sequencer == '454':
                    platform_id = 'LS454'
                else:
                    platform_id = 'ILLUMINA'

                # the add-read-group job
                #addRGJob = Job(namespace=namespace, name=addRGExecutable.name, version=version)
                addRGJob = Job(namespace=workflow.namespace,
                               name=addOrReplaceReadGroupsJava.name,
                               version=workflow.version)
                outputRGSAM = File(
                    os.path.join(addRG2BamDir,
                                 os.path.basename(alignment.path)))

                addRGJob.addArguments(javaMemRequirement, '-jar', AddOrReplaceReadGroupsJar, \
                     "INPUT=", bamF,\
                     'RGID=%s'%(read_group), 'RGLB=%s'%(platform_id), 'RGPL=%s'%(platform_id), \
                     'RGPU=%s'%(read_group), 'RGSM=%s'%(read_group),\
                     'OUTPUT=', outputRGSAM, 'SORT_ORDER=coordinate', "VALIDATION_STRINGENCY=LENIENT")
                #(SORT_ORDER doesn't actually sort; it only marks the header as sorted so that BuildBamIndexJar won't fail.)
                self.addJobUse(addRGJob,
                               file=AddOrReplaceReadGroupsJar,
                               transfer=True,
                               register=True,
                               link=Link.INPUT)
                if tmpDir:
                    addRGJob.addArguments("TMP_DIR=%s" % tmpDir)
                addRGJob.uses(bamF,
                              transfer=True,
                              register=True,
                              link=Link.INPUT)
                addRGJob.uses(baiF,
                              transfer=True,
                              register=True,
                              link=Link.INPUT)
                addRGJob.uses(outputRGSAM,
                              transfer=True,
                              register=True,
                              link=Link.OUTPUT)
                yh_pegasus.setJobProperRequirement(
                    addRGJob, job_max_memory=job_max_memory)
                for parentJob in parentJobLs:
                    if parentJob:
                        workflow.depends(parent=parentJob, child=addRGJob)
                workflow.addJob(addRGJob)


                index_sam_job = self.addBAMIndexJob(workflow, BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar, \
                 inputBamF=outputRGSAM, parentJobLs=[addRGJob], transferOutput=True, javaMaxMemory=2000)
                newAlignmentData = PassingData(alignment=alignment)
                newAlignmentData.jobLs = [index_sam_job, addRGJob]
                newAlignmentData.bamF = index_sam_job.bamFile
                newAlignmentData.baiF = index_sam_job.baiFile
                """
				# add the index job to the bamF (needs to be re-indexed)
				index_sam_job = Job(namespace=namespace, name=BuildBamIndexFilesJava.name, version=version)
				
				if input_site_handler==site_handler:	#on the same site. overwrite the original file without RG
					mvJob = Job(namespace=namespace, name=mv.name, version=version)
					mvJob.addArguments(outputRGSAM, inputFname)	#watch, it's inputFname, not input. input is in relative path.
					#samToBamJob.uses(outputRG, transfer=False, register=True, link=Link.OUTPUT)	#don't register it here
					workflow.addJob(mvJob)
					workflow.depends(parent=addRGJob, child=mvJob)
					bai_output = File('%s.bai'%inputFname)	#in absolute path, don't register it to the job
				else:
					##on different site, input for index should be outputRGSAM and register it as well
					mvJob = addRGJob
					bamF = outputRGSAM	
					addRGJob.uses(outputRGSAM, transfer=True, register=True, link=Link.OUTPUT)
					bai_output = File('%s.bai'%outputRGSAMFname)
					index_sam_job.uses(bai_output, transfer=True, register=False, link=Link.OUTPUT)
				
				index_sam_job
				
				yh_pegasus.setJobProperRequirement(index_sam_job, job_max_memory=indexJobMaxMem)
				
				workflow.addJob(index_sam_job)
				workflow.depends(parent=mvJob, child=index_sam_job)
				alignmentId2RGJobDataLs[alignment.id]= [index_sam_job, inputFile, bai_output]
				"""
                no_of_rg_jobs += 1
            else:
                newAlignmentData = alignmentData
            returnData.append(newAlignmentData)
        sys.stderr.write(" %s alignments need read-group addition. Done\n" %
                         (no_of_rg_jobs))
        return returnData
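Because the function returns a parallel list of PassingData entries whose jobLs/bamF/baiF point at either the original or the read-group-fixed files, callers can simply overwrite their alignment list. A minimal call sketch (the attribute names on self/workflow are assumptions):

    alignmentDataLs = self.addAddRG2BamJobsAsNeeded(workflow,
        alignmentDataLs=alignmentDataLs,
        site_handler=self.site_handler,
        input_site_handler=self.input_site_handler,
        addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava,
        AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar,
        BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
        BuildBamIndexJar=workflow.BuildBamIndexJar,
        mv=workflow.mv, data_dir=self.data_dir, tmpDir="/tmp")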
Example #17
    def addDrawSNPRegionJob(self, workflow, executable=None, \
       inputF=None, call_method_id=None, snpMatrixFile=None, phenotypeFile=None, output_dir=None,\
       results_directory=None,analysis_method_id_ls=None, geneAnnotationPickleFile=None,\
       list_type_id_list=None, snp_matrix_data_type=1, exclude_accessions_with_NA_phenotype=0,\
       snpInfoPickleFile=None, label_gene=1, min_MAF=0.1, min_distance=20000,\
       logFile=None,\
       parentJobLs=[], job_max_memory=2000, extraDependentInputLs=[], \
       transferOutput=False, **keywords):
        """
		2012.3.22
			arguments analysis_method_id_ls and list_type_id_list are comma-separated strings, e.g. "129,137"
		
			DrawSNPRegion.py -I /Network/Data/250k/db/dataset/call_method_80.tsv -N /Network/Data/250k/tmp-yh/phenotype/phenotype.tsv 
				-l 129 -o /Network/Data/250k/tmp-yh/snp_region -j /Network/Data/250k/tmp-yh/at_gene_model_pickelf 
				-e 80 -u yh -s -a 1,32 -z banyan -u yh
		"""
        job = Job(namespace=workflow.namespace,
                  name=executable.name,
                  version=workflow.version)
        job.addArguments("-v", self.drivername, "-z", self.hostname, "-d", self.dbname, \
            "-u", self.db_user, "-p", self.db_passwd, \
            "-i", inputF,\
            "--call_method_id=%s"%(call_method_id), \
            "-I", snpMatrixFile, \
            "-N", phenotypeFile, "--output_dir=%s"%(output_dir), \
            "--analysis_method_id_ls=%s"%(analysis_method_id_ls),\
            "-j", geneAnnotationPickleFile, "--list_type_id_list=%s"%(list_type_id_list), \
            "--snp_matrix_data_type=%s"%(snp_matrix_data_type), \
            "--min_MAF=%s"%(min_MAF), "--min_distance=%s"%(min_distance),
            )
        job.uses(inputF, transfer=True, register=True, link=Link.INPUT)
        job.uses(snpMatrixFile, transfer=True, register=True, link=Link.INPUT)
        job.uses(phenotypeFile, transfer=True, register=True, link=Link.INPUT)
        job.uses(geneAnnotationPickleFile,
                 transfer=True,
                 register=True,
                 link=Link.INPUT)
        if exclude_accessions_with_NA_phenotype:
            job.addArguments("--exclude_accessions_with_NA_phenotype")
        if label_gene:
            job.addArguments("--label_gene")
        if results_directory:
            job.addArguments("--results_directory=%s" % (results_directory))
        if snpInfoPickleFile:
            job.addArguments("-F", snpInfoPickleFile)
            job.uses(snpInfoPickleFile,
                     transfer=True,
                     register=True,
                     link=Link.INPUT)
        if logFile:
            job.addArguments("-A", logFile)
            job.uses(logFile,
                     transfer=transferOutput,
                     register=True,
                     link=Link.OUTPUT)
            job.output = logFile
        yh_pegasus.setJobProperRequirement(job, job_max_memory=job_max_memory)
        workflow.addJob(job)
        for input in extraDependentInputLs:
            job.uses(input, transfer=True, register=True, link=Link.INPUT)
        for parentJob in parentJobLs:
            workflow.depends(parent=parentJob, child=job)
        return job
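addPickleSNPInfoJob (Example #7) writes the SNP-info pickle with -F, and this job reads one back through the same flag, so chaining the two avoids recomputing SNP info for every region. A sketch (all file handles are illustrative):

    snpInfoPickleFile = File("snpInfo.call80.pickle")
    pickleSNPInfoJob = self.addPickleSNPInfoJob(workflow,
        executable=self.PickleSNPInfo, outputF=snpInfoPickleFile,
        call_method_id=80)
    drawJob = self.addDrawSNPRegionJob(workflow, executable=self.DrawSNPRegion,
        inputF=resultFile, call_method_id=80,
        snpMatrixFile=snpMatrixFile, phenotypeFile=phenotypeFile,
        output_dir="snp_region", analysis_method_id_ls="1,32",
        geneAnnotationPickleFile=geneAnnotationPickleFile,
        list_type_id_list="129,137", snpInfoPickleFile=snpInfoPickleFile,
        logFile=File("drawSNPRegion.log"),
        parentJobLs=[pickleSNPInfoJob], transferOutput=True)

Example #18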
	def run(self):
		"""
		2011-9-28
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		workflow = self.initiateWorkflow()
		
		self.registerJars()
		self.registerExecutables()
		self.registerCustomExecutables(workflow)
		site_handler =self.site_handler
		input_site_handler = self.input_site_handler
		
		ref_seq_f = self.registerOneInputFile(workflow, self.ref_seq_fname, folderName=self.pegasusFolderName)
		
		query_seq_f = self.registerOneInputFile(workflow, self.query_seq_fname, folderName=self.pegasusFolderName)
		
		# Add a mkdir job
		deltaOutputDir = "delta"
		deltaOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=deltaOutputDir)
		
		coordsOutputDir = "coords"
		coordsOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=coordsOutputDir)
		
		filterOutputDir = "filter"
		filterOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=filterOutputDir)
		
		plotOutputDir = "plot"
		plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotOutputDir)
		
		#plotScriptOutputDir = "plotScript"
		#plotScriptOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotScriptOutputDir)
		
		refNameLs = self.getFastaRecordTitleLs(self.ref_seq_fname)
		returnData3 = self.addSplitFastaFileJobs(workflow, ref_seq_f, self.SelectAndSplitFastaRecords, refNameLs, mkdirWrap=self.mkdirWrap,\
						site_handler=site_handler, namespace=self.namespace, version=self.version, fastaOutputDir='refFasta')
		refName2splitFastaJobDataLs = returnData3.refName2jobDataLs
		
		queryNameLs = self.getFastaRecordTitleLs(self.query_seq_fname)
		returnData3 = self.addSplitFastaFileJobs(workflow, query_seq_f, self.SelectAndSplitFastaRecords, queryNameLs, mkdirWrap=self.mkdirWrap,\
						site_handler=site_handler, namespace=self.namespace, version=self.version, fastaOutputDir='queryFasta')
		queryName2splitFastaJobDataLs = returnData3.refName2jobDataLs
		
		noOfJobs = len(refName2splitFastaJobDataLs) + len(queryName2splitFastaJobDataLs)
		ref_seq_prefix = os.path.splitext(os.path.basename(ref_seq_f.name))[0]
		for queryName, jobDataLs in queryName2splitFastaJobDataLs.iteritems():
			for refName, refJobDataLs in refName2splitFastaJobDataLs.iteritems():
				refSelectAndSplitFastaJob, refFastaFile = refJobDataLs[:2]
				selectAndSplitFastaJob, fastaFile = jobDataLs[:2]
				nucmerJob = Job(namespace=self.namespace, name=self.nucmer.name, version=self.version)
				outputPrefix = "%s_vs_%s_%s"%(queryName, ref_seq_prefix, refName)
				deltaFnamePrefix = os.path.join(deltaOutputDir, outputPrefix)
				nucmerJob.addArguments("--maxgap=500", "--mincluster=100", "--prefix", deltaFnamePrefix, \
									refFastaFile, fastaFile)
				nucmerJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
				nucmerJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
				deltaFname = "%s.delta"%(deltaFnamePrefix)
				deltaF = File(deltaFname)
				nucmerJob.uses(deltaF, transfer=True, register=True, link=Link.OUTPUT)
				#3000M for one nucmer job with human as ref
				job_max_memory = 5000	#in MB
				yh_pegasus.setJobProperRequirement(nucmerJob, job_max_memory=job_max_memory)
				workflow.addJob(nucmerJob)
				
				workflow.depends(parent=refSelectAndSplitFastaJob, child=nucmerJob)
				workflow.depends(parent=selectAndSplitFastaJob, child=nucmerJob)
				workflow.depends(parent=deltaOutputDirJob, child=nucmerJob)
				
				coordsFname = os.path.join(coordsOutputDir, "%s.coords"%(outputPrefix))
				coordsF = File(coordsFname)
				filterFname = os.path.join(filterOutputDir, "%s.filter"%(outputPrefix))
				filterF = File(filterFname)
				plotPrefix = os.path.join(plotOutputDir, "%s_plot"%(outputPrefix))
				png_plotF = File("%s.png"%plotPrefix)
				gp_plotF = File("%s.gp"%plotPrefix)
				fplot_plotF = File("%s.fplot"%plotPrefix)
				rplot_plotF = File("%s.rplot"%plotPrefix)
				postNucJob = Job(namespace=self.namespace, name=self.PostNucmer.name, version=self.version)
				postNucJob.addArguments(deltaF, coordsF, filterF, refFastaFile, fastaFile, plotPrefix)
				postNucJob.uses(deltaF, transfer=True, register=True, link=Link.INPUT)
				postNucJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
				postNucJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
				
				postNucJob.uses(coordsF, transfer=True, register=True, link=Link.OUTPUT)
				postNucJob.uses(filterF, transfer=True, register=True, link=Link.OUTPUT)
				postNucJob.uses(png_plotF, transfer=True, register=True, link=Link.OUTPUT)
				#leave files below behind
				#postNucJob.uses(gp_plotF, transfer=True, register=True, link=Link.OUTPUT)
				#postNucJob.uses(fplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
				#postNucJob.uses(rplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
				
				yh_pegasus.setJobProperRequirement(postNucJob, job_max_memory=2000)
				workflow.addJob(postNucJob)
				workflow.depends(parent=nucmerJob, child=postNucJob)
				workflow.depends(parent=coordsOutputDirJob, child=postNucJob)
				workflow.depends(parent=filterOutputDirJob, child=postNucJob)
				workflow.depends(parent=plotOutputDirJob, child=postNucJob)
				#workflow.depends(parent=plotScriptOutputDirJob, child=postNucJob)
				noOfJobs += 2
		sys.stderr.write(" %s jobs. \n"%(noOfJobs))
		# Write the DAX to the output file
		outf = open(self.outputFname, 'w')
		workflow.writeXML(outf)
		outf.close()