def addMakeBlastDBJob(self, executable:Transformation=None, inputFile=None,
    parentJobLs=None, extraDependentInputLs=None, transferOutput=False,
    extraArguments=None, job_max_memory=500, **keywords):
    """
    2012.10.9 use addGenericJob() instead.
    2012.5.24 untested.
    """
    extraOutputLs = []
    # formatdb ("-p F" = nucleotide input) produces one index file per suffix.
    for suffix in ['.nin', '.nhr', '.nsq']:
        dbIndexFile = File('%s%s'%(inputFile.name, suffix))
        extraOutputLs.append(dbIndexFile)
    # 2013.07.09 formatdb also writes a log file.
    extraOutputLs.append(File("formatdb.log"))
    extraArgumentList = ["-p F"]
    job = self.addGenericJob(executable=executable, inputFile=inputFile, outputFile=None,
        extraArguments=extraArguments, extraArgumentList=extraArgumentList,
        parentJobLs=parentJobLs, extraDependentInputLs=extraDependentInputLs,
        extraOutputLs=extraOutputLs,
        transferOutput=transferOutput,
        key2ObjectForJob=None,
        job_max_memory=job_max_memory)
    return job
def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
    chromosome=None, intervalData=None,
    mapEachChromosomeData=None,
    passingData=None, transferOutput=False, **keywords):
    """
    #. extract flanking sequences from the input Input (ref sequence file => contig ref sequence)
    #. blast them
    #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
        #. where hit length match query length, and no of mismatches <=2 => good => infer new coordinates
    #. output a mapping file between old SNP and new SNP coordinates.
    #. reduce this thing by combining everything
    #. make a new Input file based on the input split Input file
        (replace contig ID, position with the new one's,
        remove the header part regarding chromosomes or replace it)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    passingData.intervalFileBasenamePrefix
    passingData.splitInputFile
    """
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitInputFile = splitInputFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
        chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2
    """
    #add one computing job
    outputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.probability.tsv.gz"%(passingData.fileBasenamePrefix, intervalData.interval)))
    locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.locusIntervalDelta.tsv.gz"%(passingData.fileBasenamePrefix, intervalData.interval)))
    job = self.addAbstractMatrixFileWalkerJob(
        executable=self.ComputeLiftOverLocusProbability,
        inputFile=selectIntervalJobData.file, outputFile=outputFile,
        whichColumn=None, whichColumnHeader=None,
        logY=None, valueForNonPositiveYValue=-1,
        minNoOfTotal=1, samplingRate=1,
        inputFileFormat=None, outputFileFormat=None,
        extraArgumentList=["--locusIntervalDeltaOutputFname", locusIntervalDeltaOutputFile,
            "--startPosition %s"%(intervalData.start),
            "--stopPosition %s"%(intervalData.stop)],
        parentJobLs=[selectIntervalJobData.job],
        extraOutputLs=[locusIntervalDeltaOutputFile],
        transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
    #For each interval, probabilities are not calculated for loci in
    # extra segment (from overlapStart to start).
    returnData.jobDataLs.append(self.constructJobDataFromJob(job))
    return returnData
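# --- Hedged illustration (coordinates below are made up, not from the original code) ---
# The job above receives the file that selectIntervalFromInputFile() extracted for the
# whole overlap interval, while --startPosition/--stopPosition restrict the probability
# calculation to the core interval, e.g. for a 2Mb interval padded by 100kb on each side:
#   overlapInterval:  chr1:900001-3100000   (what the tabix-retrieve job pulled out)
#   start, stop:      1000001, 3000000      (loci outside this core get no probability)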
def reduceEachChromosome(self, chromosome=None, passingData=None,
    mapEachInputDataLs=None, chromosome2mapEachIntervalDataLs=None,
    reduceEachInputDataLs=None,
    transferOutput=True,
    **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachInputDataLs = mapEachInputDataLs
    returnData.reduceEachInputDataLs = reduceEachInputDataLs
    #reduce matrix by chosen column and average p-value
    outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
        'chr_%s_LocusLiftOverProbability.tsv.gz'%(chromosome)))
    reduceChromosomeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[self.reduceEachChromosomeDirJob], extraOutputLs=None,
        extraDependentInputLs=None, transferOutput=False)
        #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],
    mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
    for mapEachIntervalData in mapEachIntervalDataLs:
        for jobData in mapEachIntervalData.jobDataLs:
            self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])

    #add the reduction job to final stat merge job
    self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
    return returnData
def addSplitFastaFileJob(self, executable:Transformation=None, inputFile:File=None,
    outputFnamePrefix=None,
    noOfSequencesPerSplitFile=1000, filenameSuffix="", noOfTotalSequences=1000000,
    parentJobLs=[], extraDependentInputLs=[], transferOutput=False,
    extraArguments=None, job_max_memory=500, **keywords):
    """
    2012.5.24
    """
    noOfSplitFiles = int(math.ceil(noOfTotalSequences/float(noOfSequencesPerSplitFile)))
    suffixLength = len(repr(noOfSplitFiles))
    job = self.addGenericJob(executable=executable, inputArgumentOption="-i",
        inputFile=inputFile, extraArguments=extraArguments,
        extraArgumentList=["--noOfSequences %s"%(noOfSequencesPerSplitFile),
            "--outputFnamePrefix", outputFnamePrefix,
            '--filenameSuffix %s'%(filenameSuffix),
            '--suffixLength %s'%(suffixLength)],
        parentJobLs=parentJobLs, extraDependentInputLs=extraDependentInputLs,
        job_max_memory=job_max_memory)
    for i in range(noOfSplitFiles):    #start from 0
        splitFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix,
            suffixLength=suffixLength, fileOrder=i,
            filenameSuffix=filenameSuffix)
        splitFile = File(splitFname)
        self.addJobUse(job, file=splitFile, is_input=False, transfer=transferOutput)
    return job
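# Hedged sketch (an assumption about the helper, not the original implementation):
# utils.comeUpSplitFilename() is taken to build "<prefix><zero-padded index><suffix>"
# names, which is what the loop above relies on when it pre-registers the split files
# as outputs of the SplitFastaFile job.
def comeUpSplitFilename_sketch(outputFnamePrefix, suffixLength, fileOrder, filenameSuffix=""):
    """e.g. ('contigs_split_', 3, 7, '.fasta') -> 'contigs_split_007.fasta'"""
    return '%s%0*d%s' % (outputFnamePrefix, suffixLength, fileOrder, filenameSuffix)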
def addJobs(self, inputURL=None, relativePathList=[], outputDir="", username=None,
    password=None, transferOutput=True):
    """
    2012.6.27
    """
    sys.stderr.write("Adding wget jobs for %s input ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        #2013.06.26 remove all "/" from relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
            relativePath=relativePath,
            username=username, password=password,
            targetFolder=outputDir, logFile=logFile,
            cut_dir_number=self.cut_dir_number,
            parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
            transferOutput=transferOutput,
            extraArguments=None, job_max_memory=50)
        #include the tfam (outputList[1]) into the fileLs
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
            fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def reduce(self, passingData=None, reduceEachChromosomeDataLs=None,
    transferOutput=True, **keywords):
    """
    #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    reduceOutputDirJob = passingData.reduceOutputDirJob

    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200 * 20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
        minJobPropertyValue=5000, maxJobPropertyValue=10000).value

    outputFile = File(os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
    reduceJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob],
        transferOutput=transferOutput,
        )
    returnData.jobDataLs.append(PassingData(jobLs=[reduceJob], file=reduceJob.output,
        fileLs=[reduceJob.output]))

    for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
        for mapEachIntervalData in mapEachIntervalDataLs:
            self.addInputToMergeJob(reduceJob,
                parentJobLs=[mapEachIntervalData.mapJob])

    return returnData
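# Hedged sketch (assumed behaviour, not the original implementation):
# scaleJobWalltimeOrMemoryBasedOnInput() is used throughout these workflows to scale a
# job property linearly with input volume and clamp it to [min, max]; the original
# returns an object whose .value attribute carries the number.
def scale_job_property_sketch(realInputVolume, baseInputVolume, baseJobPropertyValue,
        minJobPropertyValue, maxJobPropertyValue):
    value = baseJobPropertyValue * float(realInputVolume) / float(baseInputVolume)
    return max(minJobPropertyValue, min(maxJobPropertyValue, value))

# e.g. with baseInputVolume=200*20000 and baseJobPropertyValue=60 (minutes), an input
# twice the baseline would ask for ~120 minutes, capped at 500.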
def run(self):
    """
    """
    self.setup_run()

    logDir = "Log"
    logDirJob = self.addMkDirJob(outputDir=logDir)
    individualSequenceID2FilePairLs = self.db_main.getIndividualSequenceID2FilePairLs(
        self.ind_seq_id_ls, data_dir=self.data_dir)
    for ind_seq_id, FilePairLs in individualSequenceID2FilePairLs.items():
        individual_sequence = self.db_main.queryTable(
            SunsetDB.IndividualSequence).get(ind_seq_id)
        if individual_sequence is None or individual_sequence.format!='fastq':
            continue
        for filePair in FilePairLs:
            for fileRecord in filePair:
                relativePath = fileRecord[0]
                prefix, suffix = utils.getRealPrefixSuffix(
                    os.path.basename(relativePath))
                if suffix=='.fastq':
                    filepath = os.path.join(self.data_dir, relativePath)
                    #Do not register the input fastq because InspectBaseQuality
                    # will output directly into self.data_dir.
                    logFile = File(os.path.join(logDir, f'{prefix}.log'))
                    job = self.addDBJob(
                        executable=self.InspectBaseQuality,
                        outputArgumentOption="--logFilename",
                        outputFile=logFile,
                        extraArgumentList=[
                            '-i', filepath,
                            '--read_sampling_rate', '0.01',
                            '--quality_score_format',
                            individual_sequence.quality_score_format,
                            ],
                        parentJobLs=[logDirJob],
                        transferOutput=True,
                        objectWithDBArguments=self,
                        job_max_memory=5000, walltime=120)
    self.end_run()
def selectIntervalFromInputFile(self, jobData=None, chromosome=None,
    intervalData=None, mapEachChromosomeData=None,
    passingData=None, transferOutput=False,
    **keywords):
    """
    2013.11.24
    """
    inputSuffix = utils.getRealPrefixSuffix(jobData.file.name)[1]
    outputFile = File(os.path.join(self.mapDirJob.output,
        '%s_%s%s'%(passingData.fileBasenamePrefix,
            intervalData.overlapInterval, inputSuffix)))
    tabixRetrieveJob = self.addTabixRetrieveJob(
        executable=self.tabixRetrieve,
        tabixPath=self.tabixPath,
        inputF=jobData.file, outputF=outputFile,
        regionOfInterest=intervalData.overlapInterval, includeHeader=True,
        parentJobLs=jobData.jobLs + [self.mapDirJob], job_max_memory=100,
        extraDependentInputLs=jobData.fileLs[1:],
        transferOutput=False)
    return self.constructJobDataFromJob(job=tabixRetrieveJob)
def addWgetJob(self, executable=None, url=None, relativePath=None, username=None,
    password=None,
    targetFolder=None, logFile=None, cut_dir_number=1, parentJobLs=[],
    extraDependentInputLs=[], transferOutput=False,
    extraArguments=None, job_max_memory=2000, **keywords):
    """
    2012.6.27
    """
    extraArgumentList = ['--user=%s'%(username), '--password=%s'%(password),
        '--recursive', '--no-parent',
        '--continue', "--reject='index.html*'",
        "-nc -nH --cut-dirs=%s"%(cut_dir_number),
        "-P %s"%(targetFolder),
        "%s/%s"%(url, relativePath)]
    """
    # unlike -nd, --cut-dirs does not lose with subdirectories---for instance, with
    # -nH --cut-dirs=1, a beta/ subdirectory will be placed to xemacs/beta, as one would expect.

    -c
    --continue
        Continue getting a partially-downloaded file. This is useful when you want to
        finish up a download started by a previous instance of Wget, or by another program.
    -nc
    --no-clobber
        If a file is downloaded more than once in the same directory, Wget's behavior
        depends on a few options, including -nc. In certain cases, the local file will be
        clobbered, or overwritten, upon repeated download. In other cases it will be preserved.
    -nd
    --no-directories
        Do not create a hierarchy of directories when retrieving recursively. With this
        option turned on, all files will get saved to the current directory, without
        clobbering (if a name shows up more than once, the filenames will get extensions .n).
    -np
    --no-parent
        Do not ever ascend to the parent directory when retrieving recursively. This is a
        useful option, since it guarantees that only the files below a certain hierarchy
        will be downloaded.
    -nH
    --no-host-directories
        Disable generation of host-prefixed directories. By default, invoking Wget with
        -r http://fly.srk.fer.hr/ will create a structure of directories beginning with
        fly.srk.fer.hr/. This option disables such behavior.
    -P prefix
    --directory-prefix=prefix
        Set directory prefix to prefix. The directory prefix is the directory where all
        other files and subdirectories will be saved to, i.e. the top of the retrieval
        tree. The default is . (the current directory)
    """
    if extraArguments:
        extraArgumentList.append(extraArguments)

    #wget will add some portion of the URL path to the final output files,
    # depending on cut_dir_number.
    from urllib.parse import urlparse
    url_path_list = urlparse(url).path.split('/')[1:]
    #[0] is empty because the path starts with '/'
    subPath = '/'.join(url_path_list[cut_dir_number:])

    if relativePath.find('/') >= 0:
        #2013.06.26 it's a folder itself. so no straight output.
        sys.stderr.write("\n\t Warning: item %s is a folder, will not be staged out. "
            "You have to manually copy them out of scratch folder.\n"%(relativePath))
        extraOutputLs = None
    else:
        extraOutputLs = [File(os.path.join(targetFolder,
            os.path.join(subPath, relativePath)))]
    #2012.6.27 don't pass the downloaded outputFile to argument outputFile of addGenericJob()
    # because it will add "-o" in front of it. "-o" of wget is reserved for logFile.
    return self.addGenericJob(executable=executable, inputFile=None, outputFile=logFile,
        parentJobLs=parentJobLs, extraDependentInputLs=extraDependentInputLs,
        extraOutputLs=extraOutputLs,
        transferOutput=transferOutput,
        extraArgumentList=extraArgumentList, job_max_memory=job_max_memory)
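# Illustration only (hypothetical URL and file names): how the expected local output path
# used above follows from the URL path and cut_dir_number, mirroring wget's
# -nH --cut-dirs behaviour.
import os
from urllib.parse import urlparse

def expected_wget_output_path(url, relativePath, targetFolder, cut_dir_number=1):
    url_path_list = urlparse(url).path.split('/')[1:]   # [0] is empty; path starts with '/'
    subPath = '/'.join(url_path_list[cut_dir_number:])
    return os.path.join(targetFolder, subPath, relativePath)

# expected_wget_output_path('http://example.org/data/release1', 'sample.fastq.gz',
#     'downloads', cut_dir_number=1) == 'downloads/release1/sample.fastq.gz'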
def build_pegasus_wf(cwl_wf: cwl.Workflow, wf_files: dict, wf_input_str: dict) -> Workflow:
    log.info("Building Pegasus workflow")

    wf = Workflow("cwl-converted-pegasus-workflow", infer_dependencies=True)

    for step in cwl_wf.steps:
        step_name = get_basename(step.id)
        log.info("Processing step: {}".format(step_name))
        cwl_cmd_ln_tool = (cwl.load_document(step.run)
                           if isinstance(step.run, str) else step.run)

        job = Job(PurePath(cwl_cmd_ln_tool.baseCommand).name,
                  _id=get_basename(step.id))

        # collect current step inputs
        log.info("Collecting step inputs from {}".format(step_name))
        step_inputs = dict()
        for _input in step.in_:
            input_id = get_basename(_input.id)
            step_inputs[input_id] = get_basename(_input.source)
            log.debug("step_inputs[{}] = {}".format(input_id, step_inputs[input_id]))

        # add inputs that are of type File
        for _input in cwl_cmd_ln_tool.inputs:
            if _input.type == "File":
                wf_file = File(wf_files[step_inputs[get_name(step.id, _input.id)]])
                job.add_inputs(wf_file)
                log.info("Step: {} added input file: {}".format(step_name, wf_file.lfn))
            """
            # TODO: handle File[] inputs
            elif isinstance(_input.type, cwl.CommandInputArraySchema):
                if _input.type.items == "File":
                    for f in step_inputs[get_name(step.id, _input.id)]:
                        wf_file = File(wf_files[f])
                        job.add_inputs(wf_file)
                        log.info(
                            "Step: {} added input file: {}".format(
                                step_name, wf_file.lfn
                            )
                        )
            """
        # add job outputs that are of type File
        log.info("Collecting step outputs from {}".format(step_name))
        for output in cwl_cmd_ln_tool.outputs:
            if output.type == "File":
                wf_file = File(wf_files[get_name(step.id, output.id)])
                job.add_outputs(wf_file)
                log.info("Step: {} added output file: {}".format(step_name, wf_file.lfn))
            else:
                raise NotImplementedError(
                    "Support for output types other than File is in development")

        # add job args
        args = (cwl_cmd_ln_tool.arguments
                if cwl_cmd_ln_tool.arguments is not None else list())

        # args will be added in the order of their assigned inputBinding
        def get_input_binding(_input):
            key = 0
            if hasattr(_input, "inputBinding") and hasattr(_input.inputBinding, "position"):
                key = _input.inputBinding.position
            return key if key else 0

        cwl_cmd_ln_tool_inputs = sorted(cwl_cmd_ln_tool.inputs, key=get_input_binding)

        for _input in cwl_cmd_ln_tool_inputs:
            # indicates whether or not input will appear in args
            if _input.inputBinding is not None:
                prefix = _input.inputBinding.prefix
                separate = _input.inputBinding.separate

                current_arg = ""
                if prefix:
                    current_arg += prefix

                if separate:
                    current_arg += " "

                if _input.type == "File":
                    current_arg += wf_files[step_inputs[get_name(step.id, _input.id)]]
                elif _input.type == "string":
                    current_arg += wf_input_str[step_inputs[get_name(step.id, _input.id)]]
                # TODO: provide better support for array inputs being used in args
                # (see https://www.commonwl.org/user_guide/09-array-inputs/index.html)
                elif isinstance(_input.type, cwl.CommandInputArraySchema):
                    separator = (" " if _input.inputBinding.itemSeparator is None
                                 else _input.inputBinding.itemSeparator)
                    if _input.type.items == "File":
                        current_arg += separator.join(
                            wf_files[f]
                            for f in step_inputs[get_name(step.id, _input.id)])
                    elif _input.type.items == "string":
                        current_arg += separator.join(
                            wf_input_str[step_inputs[get_name(step.id, _input.id)]])

                args.append(current_arg)

        job.add_args(*args)
        wf.add_jobs(job)

        log.info("Added job: {}".format(step.run))
        log.info("\tcmd: {}".format(job.transformation))
        log.info("\targs: {}".format(job.args))
        log.info("\tinputs: {}".format([f.lfn for f in job.get_inputs()]))
        log.info("\toutputs: {}".format([f.lfn for f in job.get_outputs()]))

    log.info("Building workflow complete. {} jobs added".format(len(wf.jobs)))

    return wf
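# Hedged usage sketch (the file names, JSON lookup tables, and the cwl_utils module path
# are assumptions; the exact parser module differs across cwl-utils versions):
#
#   import json
#   import cwl_utils.parser_v1_0 as cwl
#
#   cwl_wf = cwl.load_document("workflow.cwl")
#   with open("wf_files.json") as f:        # CWL file ids -> workflow file names
#       wf_files = json.load(f)
#   with open("wf_input_str.json") as f:    # CWL string inputs -> their values
#       wf_input_str = json.load(f)
#
#   wf = build_pegasus_wf(cwl_wf, wf_files, wf_input_str)
#   wf.write("workflow.yml")                # Pegasus 5.x Workflow.write()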
def mapEachInterval(self, alignmentData=None, intervalData=None, chromosome=None,
    VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
    mapEachChromosomeData=None, transferOutput=False,
    **keywords):
    """
    2013.03.31 use VCFJobData to decide whether to add BQSR jobs,
        called in ShortRead2Alignment.py
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    alignment = alignmentData.alignment
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix

    #SNPVCFFile = VCFJobData.file
    #if SNPVCFFile is None or VCFJobData is None:
    #   #2013.04.09 BQSR requires a VCF input regardless of the chromosome
    #   VCFJobData = self.randomSNPVCFJobDataForBQSR
    #SNPVCFFile = VCFJobData.file
    #SNPVCFJobLs = VCFJobData.jobLs

    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFileBasenameSignature
    span = intervalData.span

    if chromosome is None:
        chromosome = getattr(passingData, 'chromosome', None)

    median_depth = getattr(alignment, 'median_depth', 4)
    readSpace = median_depth * span
    #base is 4X coverage in 20Mb region => 120 minutes
    reduceReadsJobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=readSpace,
        baseInputVolume=4*20000000, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    #base is 4X, => 5000M
    reduceReadsJobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=median_depth,
        baseInputVolume=4, baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=8000).value

    reduceReadsBamFile = File(os.path.join(topOutputDirJob.output,
        '%s_%s.reduceReads.bam'%(bamFnamePrefix, overlapFileBasenameSignature)))
    #Default downsampling setting is 40 in GATK 2.4.9.
    # this downsampling happens at the ReadWalker level,
    #extraArgumentList= ["--downsample_to_coverage 250", "--downsampling_type BY_SAMPLE"]

    extraArgumentList = ["--downsample_coverage 250"]    #this is for
    #This level of downsampling only happens after the region has been evaluated,
    # therefore it can be combined with the engine level downsampling.

    reduceReadsJob = self.addGATKJob(executable=self.ReduceReadsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar,
        GATKAnalysisType='ReduceReads',
        inputFile=bamF, inputArgumentOption="-I",
        refFastaFList=passingData.refFastaFList, inputFileList=None,
        argumentForEachFileInInputFileList=None,
        interval=overlapInterval, outputFile=reduceReadsBamFile,
        parentJobLs=alignmentData.jobLs, transferOutput=False,
        job_max_memory=reduceReadsJobMaxMemory,
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=extraArgumentList,
        extraOutputLs=[],
        extraDependentInputLs=[baiF], no_of_cpus=None,
        walltime=reduceReadsJobWalltime)
    indexBamJob = self.addBAMIndexJob(
        BuildBamIndexFilesJava=self.BuildBamIndexFilesJava,
        BuildBamIndexJar=self.BuildBamIndexJar,
        inputBamF=reduceReadsJob.output,
        parentJobLs=[reduceReadsJob],
        transferOutput=False, job_max_memory=3000,
        walltime=max(120, int(reduceReadsJobWalltime/3)))
    passingData.alignmentJobAndOutputLs.append(PassingData(
        jobLs=[reduceReadsJob, indexBamJob],
        file=reduceReadsJob.output, fileLs=[reduceReadsJob.output]))
    return returnData
def doAllAccurityAlignmentJob(self, data_dir=None, normal_bam_bai=None,
    pair_bam_file_list=None, outputDirPrefix=None, parentJobLs=None,
    AccurityFolder=None, AccurityFolderJob=None):
    print("Adding Accurity jobs for %s pair individual sequences ..." % \
        (len(pair_bam_file_list)), flush=True)
    jobLs = []
    for pair_bam in pair_bam_file_list:
        tumor_bam = pair_bam[0]
        tumor_bam_bai = parentJobLs[1].baiFile
        normal_bam = pair_bam[1]
        if tumor_bam is None or normal_bam is None:
            sys.stderr.write("Error: one bam file of the tumor/normal pair does not exist.\n")
            sys.exit(2)
        #tumor_bam_path = os.path.join(data_dir, tumor_bam)
        #tumor_bai_path = tumor_bam_path + ".bai"
        #normal_bam_path = os.path.join(data_dir, normal_bam)
        #normal_bai_path = normal_bam_path + ".bai"
        Accurity_configure_path = os.path.dirname(self.AccurityPath) + "/configure"
        outputList = []
        # use splitext() rather than strip(".bam"): strip() removes characters, not a suffix.
        sample_id = os.path.splitext(os.path.basename(tumor_bam.name))[0]
        sample_folder = AccurityFolder + "/" + sample_id
        # list.append() returns None, so build the parent-job list explicitly.
        sample_folder_Job = self.addMkDirJob(outputDir=sample_folder,
            parentJobLs=(parentJobLs or []) + [AccurityFolderJob])
        outputList.append(File(sample_folder + "/infer.out.tsv"))
        outputList.append(File(sample_folder + "/infer.out.details.tsv"))
        outputList.append(File(sample_folder + "/auto.tsv"))
        outputList.append(File(sample_folder + "/cnv.plot.pdf"))
        outputList.append(File(sample_folder + "/cnv.output.tsv"))
        outputList.append(File(sample_folder + "/rc_ratio_window_count_smoothed.tsv"))
        outputList.append(File(sample_folder + "/rc_ratio_no_of_windows_by_chr.tsv"))
        outputList.append(File(sample_folder + "/cnv.intervel.tsv"))
        outputList.append(File(sample_folder + "/major_allele_fraction_exp_vs_obs.tsv"))
        outputList.append(File(sample_folder + "/peak_bounds.tsv"))
        outputList.append(File(sample_folder + "/rc_logLikelihood.log.tsv"))
        outputList.append(File(sample_folder + "/rc_ratios_of_peaks_based_on_period_from_autocor.tsv"))
        outputList.append(File(sample_folder + "/runTime.log.txt"))
        #tumor_bam_file = self.registerOneInputFile(tumor_bam_path)
        #tumor_bai_file = self.registerOneInputFile(tumor_bai_path)
        #normal_bam_file = self.registerOneInputFile(normal_bam_path)
        #normal_bai_file = self.registerOneInputFile(normal_bai_path)
        configure_file = self.registerOneInputFile(Accurity_configure_path)
        argumentList = ["-c", configure_file, "-t", tumor_bam, "-n", normal_bam,
            "-o", sample_folder, "-d", "1", "-l", "4"]
        inputFileList = [tumor_bam, tumor_bam_bai, normal_bam, normal_bam_bai, configure_file]
        job = self.addPurityJobToWorkflow(executable=self.AccurityExecutableFile,
            argumentList=argumentList,
            inputFileList=inputFileList, outputFileList=outputList,
            parentJobLs=[sample_folder_Job],
            job_max_memory=10000, no_of_cpus=8, walltime=400, sshDBTunnel=0)
        jobLs.append(job)
    return jobLs
def run(self):
    """
    """
    self.setup_run()

    isq_id2LibrarySplitOrder2FileLs = self.db_main.getISQ_ID2LibrarySplitOrder2FileLs(
        self.ind_seq_id_ls, data_dir=self.data_dir, filtered=0,
        ignoreEmptyReadFile=False)
    to_work_ind_seq_id_set = set()
    parent_individual_sequence_file_id_set = set()
    for ind_seq_id, LibrarySplitOrder2FileLs in isq_id2LibrarySplitOrder2FileLs.items():
        parent_individual_sequence = self.db_main.queryTable(
            SunsetDB.IndividualSequence).get(ind_seq_id)
        if parent_individual_sequence is not None and parent_individual_sequence.format == 'fastq':
            """
            check if the child individual_sequence already exists in db or not.
            if it does, what about its files?
            if not, go add filtering jobs.
            """
            # 2012.6.8
            individual_sequence = self.db_main.copyParentIndividualSequence(
                parent_individual_sequence=parent_individual_sequence,
                parent_individual_sequence_id=ind_seq_id,
                quality_score_format='Standard', filtered=1,
                data_dir=self.data_dir)
            library_split_order2filtered_db_entry_ls = self.getLibrarySplitOrder2DBEntryLs(
                individual_sequence)

            sequenceOutputDirJob = None
            filteredReadOutputDirJob = None
            for key, fileObjLs in LibrarySplitOrder2FileLs.items():
                if key in library_split_order2filtered_db_entry_ls:
                    sys.stderr.write(
                        "Warning: this pair of filtered individual_sequence_file(s), "
                        f"{repr(key)}, parent_individual_sequence "
                        f"(id={parent_individual_sequence.id}, {parent_individual_sequence.individual.code}), "
                        f"individual_sequence (id={individual_sequence.id}, {individual_sequence.individual.code}) "
                        "are already in db. skip.\n")
                    continue
                else:
                    if sequenceOutputDirJob is None:
                        sequenceOutputDir = os.path.join(
                            self.data_dir, individual_sequence.path)
                        sequenceOutputDirJob = self.addMkDirJob(
                            outputDir=sequenceOutputDir)
                    if filteredReadOutputDirJob is None:
                        filteredReadOutputDir = os.path.join(
                            os.path.basename(individual_sequence.path))
                        filteredReadOutputDirJob = self.addMkDirJob(
                            outputDir=filteredReadOutputDir)

                # add filter jobs
                extraDependentInputLs = []
                extraOutputLs = []
                extraArgumentList = [
                    "-a", self.adapter, "-j", self.no_of_threads,
                    "--quality-base", self.quality_base,
                    "-m", self.minimum_length
                ]
                if self.adapter2 is not None:
                    extraArgumentList.extend(["-A", self.adapter2])
                if self.maximum_length is not None:
                    extraArgumentList.extend(["-M", self.maximum_length])
                if self.trim_n:
                    extraArgumentList.append("--trim-n")
                if self.quality_cutoff is not None:
                    extraArgumentList.extend(["-q", self.quality_cutoff])

                input_fastq_list = []
                for i in range(len(fileObjLs)):
                    fileObj = fileObjLs[i]
                    try:
                        # 2012.7.2
                        inputFile = self.registerOneInputFile(
                            input_path=fileObj.path,
                            folderName='inputIndividualSequenceFile')
                    except Exception as e:
                        import pdb
                        pdb.set_trace()
                    # take the base filename as the output filename. it'll be in scratch/.
                    outputFname = os.path.join(
                        filteredReadOutputDir,
                        os.path.basename(fileObj.path))
                    outputFile = File(outputFname)
                    extraDependentInputLs.append(inputFile)
                    extraOutputLs.append(outputFile)
                    if i == 0:  # 1st mate
                        input_fastq_list.append(inputFile)
                        extraArgumentList.extend(["-o", outputFile])
                    elif i == 1:  # 2nd mate
                        input_fastq_list.append(inputFile)
                        extraArgumentList.extend(["-p", outputFile])
                    else:
                        sys.stderr.write(
                            "Error: more than two mates (mate %s) in paired-end data "
                            "(individualSequenceID=%s).\n" % (i + 1, ind_seq_id))
                        sys.exit(4)
                extraArgumentList.extend(input_fastq_list)

                filterShortRead_job = self.addFilterReadJob(
                    executable=self.cutadapt,
                    extraOutputLs=extraOutputLs,
                    parentJobLs=[filteredReadOutputDirJob],
                    job_max_memory=2000, walltime=120,
                    extraDependentInputLs=extraDependentInputLs,
                    extraArgumentList=extraArgumentList,
                    no_of_cpus=self.no_of_threads, transferOutput=False)
                for fileObj, outputFile in zip(fileObjLs, extraOutputLs):
                    logFile = File('%s_%s.register.log' %
                        (individual_sequence.id, fileObj.db_entry.id))
                    addFilteredSequences2DB_job = self.addAddFilteredSequences2DB_job(
                        executable=self.AddFilteredSequences2DB,
                        inputFile=outputFile,
                        individual_sequence_id=individual_sequence.id,
                        outputDir=sequenceOutputDir, logFile=logFile,
                        parent_individual_sequence_file_id=fileObj.db_entry.id,
                        parentJobLs=[sequenceOutputDirJob, filterShortRead_job],
                        commit=self.commit,
                        extraDependentInputLs=None, transferOutput=True,
                        sshDBTunnel=self.needSSHDBTunnel)
                to_work_ind_seq_id_set.add(ind_seq_id)
                parent_individual_sequence_file_id_set.add(fileObj.db_entry.id)
    sys.stderr.write(
        f"{self.no_of_jobs} jobs, {len(to_work_ind_seq_id_set)} individual_sequence entries, "
        f"{len(parent_individual_sequence_file_id_set)} parent_individual_sequence_file_ids.\n")
    self.end_run()
pipe2File = pegaflow.registerExecutable(wflow, pipe2File_path, args.site_handler,
    cluster_size=args.cluster_size)
mergeWC = pegaflow.registerExecutable(wflow, pipe2File_path, args.site_handler,
    executableName='mergeWC', cluster_size=args.cluster_size)
sleep = pegaflow.registerExecutable(wflow, "/bin/sleep", args.site_handler,
    cluster_size=args.cluster_size)

mergedOutputFile = File("merged.txt")
# request 500MB memory, 30 minutes run time (walltime).
mergeJob = pegaflow.addJob2workflow(wflow, mergeWC,
    argv=[mergedOutputFile, '/bin/cat'],
    input_file_list=None,
    output_file_transfer_list=[mergedOutputFile],
    output_file_notransfer_list=None,
    job_max_memory=500, walltime=30)

mkdir = pegaflow.registerExecutable(wflow, '/bin/mkdir', args.site_handler)
outputDir = 'output'
outputDirJob = pegaflow.addMkDirJob(wflow, mkdir, outputDir)
def run(self):
    ## setup_run() will call registerExecutables()
    self.setup_run()

    # Register all .py files from the input folder.
    # self.registerOneInputFile('/tmp/abc.txt') can be used to register one input file.
    inputData = self.registerFilesOfInputDir(
        inputDir=self.input_path,
        input_site_handler=self.input_site_handler,
        inputSuffixSet=self.inputSuffixSet,
        pegasusFolderName='input')

    # Pegasus jobs do NOT allow pipes. So use pipe2File (already registered in Workflow.py).
    # register wc and cat as they will be used by pipe2File.
    wcCommand = self.registerOneExecutableAsFile(path="/usr/bin/wc")
    catCommand = self.registerOneExecutableAsFile(path="/bin/cat")

    mergedOutputFile = File("merged.txt")
    # request 500MB memory, 30 minutes run time (walltime).
    # executable=self.mergeWC tells this function to use a different executable,
    #  in order to give this job a different name.
    # If executable=None or not given, self.pipe2File is used.
    mergeJob = self.addPipe2FileJob(executable=self.mergeWC,
        commandFile=catCommand,
        outputFile=mergedOutputFile,
        transferOutput=True,
        job_max_memory=500, walltime=30)

    outputDir = 'output'
    outputDirJob = self.addMkDirJob(outputDir)
    for jobData in inputData.jobDataLs:
        outputFile = File(os.path.join(outputDir,
            f'{os.path.basename(jobData.file.name)}.wc.output.txt'))
        ## wc each input file
        # Argument "executable" is not given, use self.pipe2File.
        wcJob = self.addPipe2FileJob(
            commandFile=wcCommand,
            outputFile=outputFile,
            parentJob=None, parentJobLs=[outputDirJob],
            extraArgumentList=[jobData.file],
            extraDependentInputLs=[jobData.file],
            extraOutputLs=None, transferOutput=False)
        # Add wcJob.output (the outputFile passed to addPipe2FileJob() above)
        #  as an input of mergeJob: it appends the input to the end of mergeJob's
        #  existing arguments, and wcJob.output becomes a dependent input of mergeJob.
        # addInputToMergeJob() also adds wcJob as a parent of mergeJob.
        self.addInputToMergeJob(mergeJob=mergeJob, inputF=wcJob.output,
            inputArgumentOption="", parentJobLs=[wcJob])
    # a sleep job to slow down the workflow for 30 seconds.
    # sleepJob has no output.
    sleepJob = self.addGenericJob(executable=self.sleep, extraArgumentList=[30])
    # add sleepJob as mergeJob's parent.
    self.addInputToMergeJob(mergeJob=mergeJob, parentJobLs=[sleepJob])

    # end_run() will output the DAG to output_path
    self.end_run()
def mapEachInterval(self, VCFJobData=None, passingData=None,
    transferOutput=False, **keywords):
    """
    use VCFJobData
    #. extract flanking sequences from the input VCF (ref sequence file => contig ref sequence)
    #. blast them
    #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
        #. where hit length match query length, and no of mismatches <=2 => good => infer new coordinates
    #. output a mapping file between old SNP and new SNP coordinates.
    #. reduce this thing by combining everything
    #. make a new VCF file based on the input split VCF file
        #. (replace contig ID, position with the new one's,
            remove the header part regarding chromosomes or replace it)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    mapDirJob = passingData.mapDirJob
    reduceOutputDirJob = passingData.reduceOutputDirJob

    intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
    jobData = passingData.jobData
    VCFFile = VCFJobData.file
    splitVCFJob = passingData.mapEachVCFData.splitVCFJob
    chromosome = passingData.chromosome

    # a flanking sequence extraction job
    #noOfIndividuals
    realInputVolume = passingData.noOfIndividuals * passingData.span
    baseInputVolume = 600 * 2000    #600 individuals at 2000 sites
    #base is 200 individual X 2Mb region => 120 minutes
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=1200).value
    #base is 4X, => 5000M
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=8000).value

    outputFnamePrefix = os.path.join(mapDirJob.output,
        '%s.sameSite.concordance' % (intervalFileBasenamePrefix))
    outputFile = File('%s.tsv' % (outputFnamePrefix))

    returnData.mapJob = self.addAbstractMapperLikeJob(
        executable=self.CalculateSameSiteConcordanceInVCF,
        inputF=VCFFile, outputF=outputFile,
        parentJobLs=[mapDirJob] + VCFJobData.jobLs,
        transferOutput=transferOutput,
        job_max_memory=job_max_memory, walltime=walltime)

    return returnData
def addDownsamplejob(self, data_dir=None, idDict=None, DownSamplePrefix=None,
    downSampleJava=None, downSampleJar=None, transferOutput=False):
    AccurityFolder = "AccurityResult"
    AccurityFolderJob = self.addMkDirJob(outputDir=AccurityFolder)
    sys.stderr.write("Adding downsample jobs for %s individual sequences ..." % (len(idDict)))
    SampleFolder = "%swithSeed1.0" % (DownSamplePrefix)
    SampleFolderJob = self.addMkDirJob(outputDir=SampleFolder)

    alignNormal = self.db_main.queryTable(SunsetDB.IndividualAlignment).get(idDict['normalFile'])
    alignNormalFilePath = os.path.join(data_dir, alignNormal.path)
    inputNormalBamFile = self.registerOneInputFile(alignNormalFilePath)
    coverageNormal = int(alignNormal.mean_depth)
    #alignNormalIndiv = self.db_main.queryTable(SunsetDB.IndividualSequence).get(alignNormal.ind_seq_id)
    #coverageNormal = int(alignNormalIndiv.coverage)

    alignTumor = self.db_main.queryTable(SunsetDB.IndividualAlignment).get(idDict['tumorFile'])
    alignTumorFilePath = os.path.join(data_dir, alignTumor.path)
    inputTumorBamFile = self.registerOneInputFile(alignTumorFilePath)
    coverageTumor = int(alignTumor.mean_depth)
    #alignTumorIndiv = self.db_main.queryTable(SunsetDB.IndividualSequence).get(alignTumor.ind_seq_id)
    #coverageTumor = int(alignTumorIndiv.coverage)

    job_max_memory = "5000"
    walltime = '600'
    jobLs = []
    for i in range(1, 10):
        pair_bam_file_list = []
        # keep ~i X of normal reads and ~(10-i) X of tumor reads; the merged ~10X BAM
        # then simulates a tumor purity of (10-i)*0.1.
        probNormal = float(i) / float(coverageNormal)
        probTumor = float(10 - i) / float(coverageTumor)
        outputNormalFile = File(os.path.join(SampleFolder,
            str(probNormal) + "_normal_downsample.bam"))
        outputTumorFile = File(os.path.join(SampleFolder,
            str(probTumor) + "_tumor_downsample.bam"))
        mergeJobAndOutputLs = []
        normal_down_sample_job = self.addGenericJavaJob(
            executable=downSampleJava, jarFile=downSampleJar,
            inputFile=inputNormalBamFile, inputArgumentOption="INPUT=",
            inputFileList=None, argumentForEachFileInInputFileList=None,
            outputFile=outputNormalFile, outputArgumentOption="OUTPUT=",
            parentJobLs=[SampleFolderJob], transferOutput=False,
            job_max_memory=job_max_memory,
            frontArgumentList=['DownsampleSam'], extraArguments=None,
            # pass each KEY=VALUE as one token, picard-style.
            extraArgumentList=['PROBABILITY=' + str(probNormal),
                'RANDOM_SEED=1',
                'STRATEGY=ConstantMemory',
                'VALIDATION_STRINGENCY=LENIENT'],
            extraOutputLs=None,
            extraDependentInputLs=None, no_of_cpus=None, walltime=walltime,
            sshDBTunnel=None)
        mergeJobAndOutputLs.append(PassingData(
            jobLs=[normal_down_sample_job], file=outputNormalFile))
        tumor_down_sample_job = self.addGenericJavaJob(
            executable=downSampleJava, jarFile=downSampleJar,
            inputFile=inputTumorBamFile, inputArgumentOption="INPUT=",
            inputFileList=None, argumentForEachFileInInputFileList=None,
            outputFile=outputTumorFile, outputArgumentOption="OUTPUT=",
            parentJobLs=[SampleFolderJob], transferOutput=False,
            job_max_memory=job_max_memory,
            frontArgumentList=['DownsampleSam'], extraArguments=None,
            extraArgumentList=['PROBABILITY=' + str(probTumor),
                'RANDOM_SEED=1',
                'STRATEGY=ConstantMemory',
                'VALIDATION_STRINGENCY=LENIENT'],
            extraOutputLs=None,
            extraDependentInputLs=None, no_of_cpus=None, walltime=walltime,
            sshDBTunnel=None)
        mergeJobAndOutputLs.append(PassingData(
            jobLs=[tumor_down_sample_job], file=outputTumorFile))

        puritySampleFolder = "puritySample"
        SampleFolderJob = self.addMkDirJob(outputDir=puritySampleFolder)
        purity = str((10 - i) * 0.1)
        purityDir = "purity" + str(purity)
        purityFolderJob = self.addMkDirJob(
            outputDir=os.path.join(puritySampleFolder, purityDir))
        mergedBamFile = File(os.path.join(puritySampleFolder, purityDir,
            "purity_" + purity + ".bam"))
        baseCoverage = 4 * 3000000000    # baseline
        minMergeAlignmentWalltime = 240    # in minutes, 4 hours, when coverage is defaultCoverage
        maxMergeAlignmentWalltime = 2980    # in minutes, 2 days
        minMergeAlignmentMaxMemory = 8000    # in MB, when coverage is defaultCoverage
        maxMergeAlignmentMaxMemory = 21000    # in MB

        mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=max(i, 10 - i) * 3000000000,
            baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentWalltime,
            minJobPropertyValue=minMergeAlignmentWalltime,
            maxJobPropertyValue=maxMergeAlignmentWalltime).value
        mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=max(i, 10 - i) * 3000000000,
            baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentMaxMemory,
            minJobPropertyValue=minMergeAlignmentMaxMemory,
            maxJobPropertyValue=maxMergeAlignmentMaxMemory).value

        MergeJob, bamIndexJob = self.addAlignmentMergeJob(
            alignmentJobAndOutputLs=mergeJobAndOutputLs,
            outputBamFile=mergedBamFile,
            needBAMIndexJob=True,
            parentJobLs=[SampleFolderJob, purityFolderJob],
            transferOutput=transferOutput,
            job_max_memory=mergeAlignmentMaxMemory,
            walltime=mergeAlignmentWalltime)
        normal_part_refer = self.registerOneInputFile(
            inputFname="/y/Sunset/workflow/real_data/downsample/normal_0.2.bam",
            folderName=os.path.join(puritySampleFolder, purityDir))
        normal_bam_bai = self.registerOneInputFile(
            inputFname="/y/Sunset/workflow/real_data/downsample/normal_0.2.bam.bai",
            folderName=os.path.join(puritySampleFolder, purityDir))
        pair_bam_file_list.append([mergedBamFile, normal_part_refer])
        AccurityJob = self.doAllAccurityAlignmentJob(data_dir=None,
            normal_bam_bai=normal_bam_bai,
            pair_bam_file_list=pair_bam_file_list,
            outputDirPrefix=None, parentJobLs=[MergeJob, bamIndexJob],
            AccurityFolder=AccurityFolder, AccurityFolderJob=AccurityFolderJob)
        # collect the Accurity jobs so the function returns all of them,
        # not just the last iteration's (previously empty) list.
        jobLs.extend(AccurityJob)
    return jobLs
def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None,
    noOfTotalSequences=None, transferOutput=True, makeBlastDBJob=None):
    """
    2012.5.24
    """
    sys.stderr.write("Adding blast jobs for %s input ... "%(len(inputData.jobDataLs)))
    no_of_jobs = 0

    topOutputDir = "%sBlast"%(outputDirPrefix)
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
    allBlastMergeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=allBlastResultFile, transferOutput=transferOutput,
        parentJobLs=[topOutputDirJob])
    no_of_jobs += 1

    ntDatabaseFile = ntDatabaseFileList[0]
    returnData = PassingData()
    returnData.jobDataLs = []

    for jobData in inputData.jobDataLs:
        inputF = jobData.output
        outputFnamePrefix = os.path.join(topOutputDir,
            os.path.splitext(os.path.basename(inputF.name))[0])

        splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile,
            inputFile=inputF, outputFnamePrefix=outputFnamePrefix,
            noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta",
            noOfTotalSequences=noOfTotalSequences,
            parentJobLs=jobData.jobLs + [topOutputDirJob],
            extraDependentInputLs=None, transferOutput=False,
            extraArguments=None, job_max_memory=500)
        no_of_jobs += 1
        for splitFastaOutput in splitFastaJob.outputList:
            outputFile = File('%s.tsv'%(splitFastaOutput.name))
            blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper,
                inputFile=splitFastaOutput, outputFile=outputFile,
                outputFnamePrefix=splitFastaOutput.name,
                databaseFile=ntDatabaseFile,
                maxNoOfMismatches=self.maxNoOfMismatches,
                minNoOfIdentities=self.minNoOfIdentities,
                minIdentityPercentage=self.minIdentityPercentage,
                blastallPath=self.blastallPath,
                parentJobLs=[splitFastaJob, makeBlastDBJob],
                extraDependentInputLs=ntDatabaseFileList,
                transferOutput=False,
                extraArguments=None, job_max_memory=1000)

            #add output to some reduce job
            self.addInputToMergeJob(allBlastMergeJob,
                inputF=blastJob.output, parentJobLs=[blastJob])
            no_of_jobs += 1
    sys.stderr.write("%s jobs. Done.\n"%(no_of_jobs))
    #include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob],
        file=allBlastResultFile,
        fileLs=[allBlastResultFile]))
    return returnData
def addJobs(self, inputData=None, topOutputDir="output", needSSHDBTunnel=0):
    """
    2012.3.14
    """
    sys.stderr.write("Adding read counting jobs on %s input ..."%\
        (len(inputData.jobDataLs)))
    no_of_jobs = 0

    if topOutputDir:
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
        no_of_jobs += 1
    else:
        topOutputDirJob = None

    finalReduceFile = File(os.path.join(topOutputDir, 'read_base_count.tsv'))

    readBaseCountMergeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=finalReduceFile, transferOutput=True, extraArguments=None,
        parentJobLs=[topOutputDirJob])

    logFile = File(os.path.join(topOutputDir, 'PutReadBaseCountIntoDB.log'))
    putCountIntoDBJob = self.addPutReadBaseCountIntoDBJob(
        executable=self.PutReadBaseCountIntoDB, inputFileLs=[finalReduceFile],
        logFile=logFile, commit=self.commit,
        parentJobLs=[readBaseCountMergeJob], extraDependentInputLs=[],
        transferOutput=True, extraArguments=None,
        job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
    no_of_jobs += 2
    for jobData in inputData.jobDataLs:
        #add the read count job
        outputFile = File(os.path.join(topOutputDir,
            'read_count_isq_%s_isqf_%s.tsv'%(jobData.isq_id, jobData.isqf_id)))
        readCountJob = self.addCountFastqReadBaseCountJob(
            executable=self.CountFastqReadBaseCount,
            inputFile=jobData.output, outputFile=outputFile,
            isq_id=jobData.isq_id, isqf_id=jobData.isqf_id,
            parentJobLs=jobData.jobLs + [topOutputDirJob],
            extraDependentInputLs=None, transferOutput=False,
            extraArguments=None, job_max_memory=10, no_of_cpus=4)

        no_of_jobs += 1
        self.addInputToMergeJob(readBaseCountMergeJob,
            inputF=readCountJob.output,
            parentJobLs=[readCountJob])

    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return putCountIntoDBJob
def preReduce(self, outputDirPrefix="", passingData=None, transferOutput=True,
    **keywords):
    """
    2012.9.17
    """
    returnData = ParentClass.preReduce(self, outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=transferOutput, **keywords)

    #add a stat merge job and a genome wide plot job
    outputFile = File(os.path.join(self.reduceOutputDirJob.output,
        'locusLiftOverProbability.tsv'))
    self.reduceJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[self.reduceOutputDirJob], extraDependentInputLs=None,
        transferOutput=False)

    sortProbabilityFile = File(os.path.join(self.reduceOutputDirJob.output,
        'locusLiftOverProbability.sorted.tsv'))
    sortProbabilityJob = self.addSortJob(inputFile=self.reduceJob.output,
        outputFile=sortProbabilityFile,
        parentJobLs=[self.reduceJob],
        extraOutputLs=None, transferOutput=False,
        extraArgumentList=["""-k1,1 -k2,3n """],
        sshDBTunnel=None,
        job_max_memory=4000, walltime=120)
    #2013.12.3 Tab delimiter syntax (-t$'\t') is removed because it can't be passed correctly.
    #2013.12.3 Tried -t "`/bin/echo -e '\t'`" as well, didn't work either.
    # However since each column field doesn't contain blank,
    # it is fine to just use the default separator (non-blank to blank).
    returnData.jobDataLs.append(self.constructJobDataFromJob(sortProbabilityJob))

    outputFile = File(os.path.join(self.plotDirJob.output, 'locusLiftOverProbability.png'))
    self.addPlotGenomeWideDataJob(inputFileList=None,
        inputFile=self.reduceJob.output,
        outputFile=outputFile,
        whichColumn=None, whichColumnHeader="mapPvalue",
        whichColumnPlotLabel="mapPvalue",
        logX=None, logY=2, valueForNonPositiveYValue=-1,
        xScaleLog=None, yScaleLog=None,
        missingDataNotation='NA',
        xColumnPlotLabel="genomePosition", xColumnHeader="oldStart",
        xtickInterval=0,
        drawCentromere=True, chrColumnHeader="oldChromosome",
        minChrLength=None, minNoOfTotal=None, maxNoOfTotal=None,
        figureDPI=100, formatString=".", ylim_type=2, samplingRate=1,
        logCount=False, need_svg=True,
        tax_id=self.ref_genome_tax_id,
        sequence_type_id=self.ref_genome_sequence_type_id, chrOrder=1,
        inputFileFormat=1, outputFileFormat=None,
        parentJobLs=[self.reduceJob],
        extraDependentInputLs=None,
        extraArguments=None, extraArgumentList=None,
        transferOutput=True, job_max_memory=1000,
        sshDBTunnel=self.needSSHDBTunnel)
    #xtickInterval=0 means no ticks on x-axis.

    outputFile = File(os.path.join(self.plotDirJob.output,
        'locusLiftOverProbabilityHist.png'))
    #no spaces or parenthesis or any other shell-vulnerable letters in the x
    # or y axis labels (whichColumnPlotLabel, xColumnPlotLabel)
    self.addDrawHistogramJob(executable=self.DrawHistogram,
        inputFileList=[self.reduceJob.output],
        outputFile=outputFile,
        whichColumnHeader="mapPvalue", whichColumnPlotLabel="minusLogLiftOverPvalue",
        xScaleLog=0, yScaleLog=1,
        logCount=False, logY=2, valueForNonPositiveYValue=50,
        minNoOfTotal=10,
        figureDPI=100, samplingRate=1, legendType=1,
        parentJobLs=[self.plotDirJob, self.reduceJob],
        extraDependentInputLs=None,
        transferOutput=True, job_max_memory=8000)
    return returnData
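# Illustration (assumes the merged table's first columns are chromosome, start, stop):
# the sort job above ("-k1,1 -k2,3n") orders rows by chromosome lexically and then by
# position numerically. A rough pure-Python equivalent on parsed rows:
rows = [("Contig10", "500", "600"), ("Contig1", "200", "300"), ("Contig1", "50", "80")]
rows.sort(key=lambda r: (r[0], int(r[1])))
# -> [('Contig1', '50', '80'), ('Contig1', '200', '300'), ('Contig10', '500', '600')]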
def addAllJobs(self,
    data_dir=None,
    outputDirPrefix="", transferOutput=True, **keywords):
    """
    2013.2.27
        run ms
        estimate parameters from ms
        ms2SLiM
        SLiM forward simulator with estimated ms-parameters or take the output of ms as input
        SLiM2PolymorphismTableFile
        AddPopGenSimulation2DB.py
    """
    sys.stderr.write("Adding jobs for pop-gen simulation #jobs=%s... \n"%\
        (self.no_of_jobs))

    returnData = PassingData()
    returnData.jobDataLs = []

    passingData = PassingData(fileBasenamePrefix=None,
        outputDirPrefix=outputDirPrefix,
        jobData=None,
        preReduceReturnData=None,
        association_group_key2orderIndex={},
        association_group_key2resultList={},
        association_group_key2reduceAssociationPeakJobMatrix={},
        association_group_key2countAssociationLocusJobList={},
        resultID2defineLandscapeJobData={},
        )

    preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=False,
        **keywords)

    mapDirJob = preReduceReturnData.mapDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
    reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob

    passingData.preReduceReturnData = preReduceReturnData

    #add output pedigree job

    for i in range(self.noOfReplicates):
        popGenSimulationFolderJob = self.addMkDirJob(
            outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)),
            parentJobLs=[mapDirJob])
        #pending user choice, use ms/sfs-code/slim/ms & slim combination
        msOutputFile = File(os.path.join(popGenSimulationFolderJob.output,
            'sim%s_msOutput.txt.gz'%(i)))
        popSimulationJob = self.addMSSimulationJob(outputFile=msOutputFile,
            recombinationRate=self.recombinationRate, mutationRate=self.mutationRate,
            initialEffectivePopulationSize=self.initialEffectivePopulationSize,
            otherParametersPassedToPopGenSimulator=self.otherParametersPassedToPopGenSimulator,
            sampleSize=self.sampleSize, noOfLociToSimulate=self.noOfLociToSimulate,
            simulateLocusLengthList=self.simulateLocusLengthList,
            parentJobLs=[popGenSimulationFolderJob],
            extraDependentInputLs=None, extraOutputLs=None,
            transferOutput=False, extraArguments=None, extraArgumentList=None,
            job_max_memory=2000, walltime=180)

        #. convert ms pop-gen output 2 polymorphism-table file
        msOutputHDF5File = File(os.path.join(popGenSimulationFolderJob.output,
            'sim%s_msOutput.h5'%(i)))
        msOutput2PolymorphismTableFileJob = self.addGenericJob(
            executable=self.msOutput2PolymorphismTableFile,
            inputFile=popSimulationJob.output,
            outputFile=msOutputHDF5File,
            parentJob=None, parentJobLs=[popGenSimulationFolderJob, popSimulationJob],
            extraDependentInputLs=None, extraOutputLs=None, transferOutput=False,
            frontArgumentList=None,
            extraArguments=None,
            extraArgumentList=None, job_max_memory=2000,
            no_of_cpus=None, walltime=None)

        #. add polymorphism-table file to db
        logFile = File(os.path.join(popGenSimulationFolderJob.output,
            "sim%s_2DB.log" % (i)))
        extraArgumentList = ["--r %s"%self.recombinationRate,
            "--rho %s"%popSimulationJob.rho,
            "--mu %s"%self.mutationRate,
            "--theta %s"%popSimulationJob.theta,
            "--n0 %s"%self.initialEffectivePopulationSize,
            "--no_of_populations 1",
            "--no_of_chromosomes %s"%self.sampleSize,
            "--chromosome_length %s"%popSimulationJob.locusLength,
            "--replicate_index %s"%(i)]
        """
        extraArgumentList.append("--parent_pop_gen_simulation_type_id %s"%\
            self.parent_pop_gen_simulation_type_id)
        """
        simulation2DBJob = self.addPutStuffIntoDBJob(
            executable=self.AddPopGenSimulation2DB,
            inputFileList=[msOutput2PolymorphismTableFileJob.output],
            logFile=logFile, commit=True,
            parentJobLs=[popGenSimulationFolderJob, msOutput2PolymorphismTableFileJob],
            extraDependentInputLs=None, transferOutput=True, extraArguments=None,
            extraArgumentList=extraArgumentList,
            job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
def addAddRG2BamJobsAsNeeded(self, alignmentDataLs=None, transferOutput=True):
    """
    2011-9-15
        add a read group only when the alignment doesn't have it according to db record.
        DBVervet.pokeBamReadGroupPresence() from misc.py helps to fill in db records
        if it's unclear.
    2011-9-14
        The read-group adding jobs will have a "move" part that overwrites
        the original bam&bai if site_handler and input_site_handler is same.
        For those alignment files that don't need read-group addition, it doesn't matter:
        pegasus will transfer/symlink them.
    """
    print(f"Adding add-read-group2BAM jobs for {len(alignmentDataLs)} "
        f"alignments if read group is not detected ... ", flush=True)
    job_max_memory = 3500    #in MB

    addRG2BamDir = None
    addRG2BamDirJob = None

    no_of_rg_jobs = 0
    returnData = []
    for alignmentData in alignmentDataLs:
        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        if alignment.read_group_added!=1:
            if addRG2BamDir is None:
                addRG2BamDir = "addRG2Bam"
                addRG2BamDirJob = self.addMkDirJob(outputDir=addRG2BamDir)

            # add RG to this bam
            sequencer = alignment.individual_sequence.sequencer
            read_group = alignment.getReadGroup()
            if sequencer=='454':
                platform_id = 'LS454'
            elif sequencer=='GA':
                platform_id = 'ILLUMINA'
            else:
                platform_id = 'ILLUMINA'
            outputRGSAM = File(os.path.join(addRG2BamDir,
                os.path.basename(alignment.path)))
            addRGJob:Job = self.addJavaJob(self.AddOrReplaceReadGroupsJava,
                jarFile=self.AddOrReplaceReadGroupsJar,
                inputArgumentOption="INPUT=", inputFile=bamF,
                outputArgumentOption="OUTPUT=", outputFile=outputRGSAM,
                transferOutput=transferOutput,
                extraArgumentList=['RGID=%s'%(read_group), 'RGLB=%s'%(platform_id),
                    'RGPL=%s'%(platform_id), 'RGPU=%s'%(read_group),
                    'RGSM=%s'%(read_group),
                    'SORT_ORDER=coordinate', "VALIDATION_STRINGENCY=LENIENT"],
                parentJobLs=parentJobLs, extraDependentInputLs=[baiF],
                job_max_memory=job_max_memory)
            #(adding the SORT_ORDER doesn't do sorting but it marks the header
            # as sorted so that BuildBamIndexJar won't fail.)
            if self.tmpDir:
                addRGJob.add_args("TMP_DIR=%s"%self.tmpDir)
            if addRG2BamDirJob:
                self.add_dependency(addRGJob, parents=[addRG2BamDirJob])

            index_sam_job = self.addBAMIndexJob(
                inputBamF=outputRGSAM, parentJobLs=[addRGJob],
                transferOutput=transferOutput, javaMaxMemory=2000)
            newAlignmentData = PassingData(alignment=alignment)
            newAlignmentData.jobLs = [index_sam_job, addRGJob]
            newAlignmentData.bamF = index_sam_job.bamFile
            newAlignmentData.baiF = index_sam_job.baiFile
            no_of_rg_jobs += 1
        else:
            newAlignmentData = alignmentData
        returnData.append(newAlignmentData)
    print(f"{no_of_rg_jobs} alignments need read-group addition.", flush=True)
    return returnData
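# Illustration (hypothetical read group "ind1_seq5"): the RG* arguments passed to Picard
# AddOrReplaceReadGroups above end up as one @RG header line in the output BAM.
def read_group_tags_sketch(read_group="ind1_seq5", platform_id="ILLUMINA"):
    # Mirrors the argument list above; note that the original code passes platform_id
    # as RGLB, so the library (LB) field also carries the platform string.
    return {"ID": read_group, "LB": platform_id, "PL": platform_id,
            "PU": read_group, "SM": read_group}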
def reduceAfterEachAlignment(self, passingData=None, transferOutput=False,
    data_dir=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    alignmentJobAndOutputLs = getattr(passingData, 'alignmentJobAndOutputLs', [])
    bamFnamePrefix = passingData.bamFnamePrefix
    topOutputDirJob = passingData.topOutputDirJob
    individual_alignment = passingData.individual_alignment
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if len(alignmentJobAndOutputLs) > 0:
        #2012.3.29 merge alignment output only when there is something to merge!
        #2013.04.09 create a new child alignment local_realigned =1, etc.
        new_individual_alignment = self.db.copyParentIndividualAlignment(
            parent_individual_alignment_id=individual_alignment.id,
            data_dir=self.data_dir,
            local_realigned=individual_alignment.local_realigned,
            reduce_reads=1)

        # replace read_group with the new one to each alignment job
        newAlignmentJobAndOutputLs = []
        for alignmentJobAndOutput in alignmentJobAndOutputLs:
            # add a AddReadGroup job
            alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
            fileBasenamePrefix = os.path.splitext(alignmentJob.output.name)[0]
            outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
            # needBAMIndexJob=False because addAlignmentMergeJob()
            #  does not need .bai.
            addRGJob = self.addReadGroupJob(
                individual_alignment=new_individual_alignment,
                inputBamFile=alignmentJob.output,
                outputBamFile=outputRGBAM,
                needBAMIndexJob=False,
                parentJobLs=[alignmentJob, indexAlignmentJob],
                extraDependentInputLs=alignmentJob.outputLs[1:],
                job_max_memory=2500, transferOutput=False)
            newAlignmentJobAndOutputLs.append(
                PassingData(jobLs=[addRGJob], file=addRGJob.output))

        mergedBamFile = File(os.path.join(reduceOutputDirJob.output,
            '%s.merged.bam' % (bamFnamePrefix)))
        alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
            alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
            outputBamFile=mergedBamFile,
            needBAMIndexJob=True,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=False)
        #2012.9.19 add/copy the alignment file to db-affliated storage
        #add the metric file to AddAlignmentFile2DB.py as well
        # (to be moved into db-affiliated storage)
        logFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_2db.log' % (bamFnamePrefix)))
        alignment2DBJob = self.addAlignmentFile2DBJob(
            executable=self.AddAlignmentFile2DB,
            inputFile=alignmentMergeJob.output,
            baiFile=bamIndexJob.baiFile,
            individual_alignment_id=new_individual_alignment.id,
            logFile=logFile,
            data_dir=data_dir,
            otherInputFileList=None,
            parentJobLs=[alignmentMergeJob, bamIndexJob],
            transferOutput=transferOutput,
            job_max_memory=2000,
            sshDBTunnel=self.needSSHDBTunnel, commit=True)
        self.no_of_jobs += 1
        returnData.jobDataLs.append(
            PassingData(jobLs=[alignment2DBJob], file=alignment2DBJob.logFile,
                fileLs=[alignment2DBJob.logFile]))
    return returnData
def addJobs(self, inputData=None, db_main=None, genotypeMethodShortName=None, commit=None,
    data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,
    maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
    """
    2012.5.9
    """
    sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... " % (len(inputData.jobDataLs)))

    topOutputDir = "%sVCF2DB" % (outputDirPrefix)
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)

    firstVCFFile = inputData.jobDataLs[0].vcfFile
    logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
    addGM2DBJob = self.addAddGenotypeMethod2DBJob(
        executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=logFile, data_dir=data_dir, commit=commit,
        parentJobLs=None, extraDependentInputLs=None, transferOutput=True,
        extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
    updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
    updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(
        executable=self.UpdateGenotypeMethodNoOfLoci,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=updateGMlogFile, data_dir=data_dir, commit=commit,
        parentJobLs=[topOutputDirJob],
        extraDependentInputLs=[], transferOutput=True,
        extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)

    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        if maxContigID:
            contig_id = self.getContigIDFromFname(inputF.name)
            try:
                contig_id = int(contig_id)
                if contig_id > maxContigID:    #skip the small contigs
                    continue
            except:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
        logFile = File(os.path.join(topOutputDir,
            'AddVCFFile2DB_%s.log' % (self.getChrFromFname(inputF.name))))
        addVCFJob = self.addAddVCFFile2DBJob(
            executable=self.AddVCFFile2DB, inputFile=inputF,
            genotypeMethodShortName=genotypeMethodShortName,
            logFile=logFile, format="VCF", data_dir=data_dir,
            checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit,
            parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[],
            transferOutput=True,
            extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
        self.add_dependency(updateGMNoOfLociJob, parents=[addVCFJob])
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    #include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob],
        file=updateGMlogFile,
        fileLs=[updateGMlogFile]))
    return returnData
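# Hedged sketch (an assumption about the helper, not the original implementation):
# getContigIDFromFname() is taken to pull the numeric contig ID out of a VCF filename,
# which is what the maxContigID filter above relies on.
import os
import re

def get_contig_id_sketch(fname):
    """e.g. 'Contig103.filtered.vcf.gz' -> '103'"""
    match = re.search(r'Contig(\d+)', os.path.basename(fname))
    return match.group(1) if match else None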
def reduceAfterEachAlignment(self, passingData=None, transferOutput=False,
    data_dir=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    alignmentJobAndOutputLs = getattr(passingData, 'alignmentJobAndOutputLs', [])
    bamFnamePrefix = passingData.bamFnamePrefix
    topOutputDirJob = passingData.topOutputDirJob
    individual_alignment = passingData.individual_alignment
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if len(alignmentJobAndOutputLs) > 0:
        #2012.3.29 merge alignment output only when there is something to merge!
        #2013.04.09 create a new child alignment local_realigned =1, etc.
        new_individual_alignment = self.db.copyParentIndividualAlignment(
            parent_individual_alignment_id=individual_alignment.id,
            mask_genotype_method_id=self.new_mask_genotype_method_id,
            data_dir=self.data_dir, local_realigned=1)

        baseCoverage = 4    #baseline
        actualCoverage = getattr(individual_alignment.individual_sequence,
            'coverage', baseCoverage)
        minMergeAlignmentWalltime = 240    #in minutes, 4 hours, when coverage is defaultCoverage
        maxMergeAlignmentWalltime = 2880    #in minutes, 2 days
        minMergeAlignmentMaxMemory = 7000    #in MB, when coverage is defaultCoverage
        maxMergeAlignmentMaxMemory = 12000    #in MB

        mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=actualCoverage,
            baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentWalltime * 2,
            minJobPropertyValue=minMergeAlignmentWalltime,
            maxJobPropertyValue=maxMergeAlignmentWalltime).value
        mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=actualCoverage,
            baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentMaxMemory,
            minJobPropertyValue=minMergeAlignmentMaxMemory,
            maxJobPropertyValue=maxMergeAlignmentMaxMemory).value

        # replace read_group with the new one to each alignment job
        newAlignmentJobAndOutputLs = []
        for alignmentJobAndOutput in alignmentJobAndOutputLs:
            # add a AddReadGroup job
            alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
            fileBasenamePrefix = os.path.splitext(alignmentJob.output.name)[0]
            outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
            # needBAMIndexJob=False because addAlignmentMergeJob()
            #  does not need .bai.
            addRGJob = self.addReadGroupJob(
                individual_alignment=new_individual_alignment,
                inputBamFile=alignmentJob.output,
                outputBamFile=outputRGBAM,
                needBAMIndexJob=False,
                parentJobLs=[alignmentJob, indexAlignmentJob],
                extraDependentInputLs=alignmentJob.outputLs[1:],
                job_max_memory=2500, transferOutput=False,
                walltime=max(180, mergeAlignmentWalltime / 20))
            newAlignmentJobAndOutputLs.append(
                PassingData(jobLs=[addRGJob], file=addRGJob.output))

        mergedBamFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_recal.bam'%(bamFnamePrefix)))
        alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
            alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
            outputBamFile=mergedBamFile,
            needBAMIndexJob=True,
            parentJobLs=[reduceOutputDirJob],
            walltime=mergeAlignmentWalltime,
            job_max_memory=mergeAlignmentMaxMemory,
            transferOutput=False)
        #2012.9.19 add/copy the alignment file to db-affliated storage
        #add the metric file to AddAlignmentFile2DB.py as well
        # (to be moved into db-affiliated storage)
        logFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_2db.log' % (bamFnamePrefix)))
        alignment2DBJob = self.addAlignmentFile2DBJob(
            executable=self.AddAlignmentFile2DB,
            inputFile=alignmentMergeJob.output,
            baiFile=bamIndexJob.baiFile,
            individual_alignment_id=new_individual_alignment.id,
            mask_genotype_method_id=self.new_mask_genotype_method_id,
            logFile=logFile,
            data_dir=data_dir,
            otherInputFileList=None,
            parentJobLs=[alignmentMergeJob, bamIndexJob],
            transferOutput=transferOutput,
            sshDBTunnel=self.needSSHDBTunnel, commit=True,
            job_max_memory=2000,
            walltime=max(180, mergeAlignmentWalltime / 2))
        self.no_of_jobs += 1
        returnData.jobDataLs.append(PassingData(jobLs=[alignment2DBJob],
            file=alignment2DBJob.logFile,
            fileLs=[alignment2DBJob.logFile]))
    return returnData