def run(self, impl, dependList): impl.log.info("step: jointcalling!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) hadoop_parameter = '' if self.hadoop.get('queue'): hadoop_parameter += ' -D mapreduce.job.queuename={} '.format( self.hadoop.queue) # extend program path self.jointcalling.program = self.expath('jointcalling.program') # global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": "%s jar %s JointCalling %s" % (self.hadoop.bin, self.jointcalling.program, hadoop_parameter), "REF": "file://%s" % self.ref.normal.gaeaIndex, "REDUCERNUM": self.hadoop.reducer_num }) # script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${INPUT}/_*" % fs_cmd.delete) cmd.append("%s ${OUTDIR}" % fs_cmd.delete) cmd.append( "${PROGRAM} -i ${INPUT} -o ${OUTDIR} -r ${REF} -n ${REDUCERNUM} %s" % self.jointcalling.parameter) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'jointcalling_output') result.output[sampleName] = os.path.join(hdfs_outputPath, 'vcf') # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath }) # write script scriptPath = \ impl.write_scripts( name='jointcalling', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) # result result.script.update(scriptPath) return result
class clean(Workflow): """ clean """ INIT = bundle(clean=bundle()) INIT.clean.program = "/szhwfs1/ST_HEALTH/GENOME_APP/F16ZQSB1SY2582/personalgenome/lib/genome_api_for_gaea.pl" INIT.clean.parameter = '' def run(self, impl, dependList): impl.log.info("step: clean!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) #extend program path #self.clean.program = self.expath('clean.program') #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("source %s/bin/activate" % self.GAEA_HOME) cmd.append( "check.py -s %s/state.json -n ${SAMPLE} -t %s %s" % (self.stateDir, ','.join(dependList), self.init.check_state_param)) cmd.append("if [ $? = 0 ];then") cmd.append("%s %s/${SAMPLE}" % (fs_cmd.delete, self.option.dirHDFS)) if self.init.check_state_param: cmd.append("${CPVCF}") cmd.append("fi") JobParamList = [] for sampleName in self.sample: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) vcf = '' for step in dependList: vcf_tmp = self.results[step].output[sampleName] if os.path.basename(vcf_tmp).find('vcf') != -1: vcf = vcf_tmp break #global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "CPVCF": "cp %s /ldfssz1/ST_HEALTH/WGS/project/3000members_hg38/vcf/" % vcf if vcf else '' }) #write script scriptPath = \ impl.write_scripts( name = 'clean', commands=cmd, JobParamList=JobParamList) #result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: realignment!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.realignment.program = self.expath('realignment.program') if self.option.multiSample: self.realignment.parameter += " --multiSample" fs_type = 'file://' if self.hadoop.input_format == 'hdfs': fs_type = '' #global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": "%s jar %s Realigner" % (self.hadoop.bin, self.realignment.program), "REF": "file://%s" % self.ref.normal.gaeaIndex, "REDUCERNUM":self.hadoop.reducer_num }) hadoop_parameter = '' if self.hadoop.get('queue'): hadoop_parameter += ' -D mapreduce.job.queuename={} '.format(self.hadoop.queue) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTDIR}" % fs_cmd.delete ) cmd.append("%s ${INPUT}/_*" % fs_cmd.delete ) cmd.append("${PROGRAM} %s -i ${INPUT} -o ${OUTDIR} -r ${REF} -n ${REDUCERNUM} %s" % (hadoop_parameter, self.realignment.parameter)) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'realignment_output') #global param JobParamList.append({ "SAMPLE" : sampleName, "SCRDIR" : scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath }) result.output[sampleName] = os.path.join(hdfs_outputPath,'fixmate') #write script scriptPath = \ impl.write_scripts( name = 'realignment', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) #result result.script.update(scriptPath) return result
class baserecal_spark(Workflow):
    """ baserecal_spark """
    INIT = bundle(baserecal_spark=bundle())
    INIT.baserecal_spark.bqsr = "/ifs4/ISDC_BD/huweipeng/project/BQSR/GaeaRecalibrationSpark.jar"
    INIT.baserecal_spark.parameter = "-v file:///ifs4/ISDC_BD/GaeaProject/resource/dbsnp_135.hg19.modify.vcf"

    def run(self, impl, dependList):
        impl.log.info("step: baserecal_spark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.baserecal_spark.bqsr = self.expath('baserecal_spark.bqsr')

        if self.option.multiSample:
            self.baserecal_spark.parameter += " -MutiSample "

        #global param
        ParamDict = self.file.copy()
        ParamDict.update({
            "PROGRAM_BQSR": "spark-submit --master yarn --num-executors 192 --executor-cores 1 --executor-memory 6g %s -n 2000" % self.baserecal_spark.bqsr,
            "REF": "file://%s" % self.ref.normal.gaeaIndex
        })

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${INPUT}/_*" % fs_cmd.delete)
        cmd.append("%s ${OUTDIR_BQSR}" % fs_cmd.delete)
        cmd.append("${PROGRAM_BQSR} -i ${INPUT} -o ${OUTDIR_BQSR} --ref ${REF} %s" % self.baserecal_spark.parameter)

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'baserecal_spark_output')

            #global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "OUTDIR_BQSR": hdfs_outputPath
            })
            result.output[sampleName] = hdfs_outputPath

        #write script
        scriptPath = \
            impl.write_scripts(
                name='baserecal_spark',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        #result
        result.script.update(scriptPath)
        return result
class cnv(Workflow):
    """ cnv """
    INIT = bundle(cnv=bundle())
    INIT.cnv.program = ""
    INIT.cnv.parameter = ""

    def run(self, impl, dependList):
        impl.log.info("step: cnv!")
        # depend bamQC
        inputInfo = self.results[dependList[0]].output
        result = bundle(script=bundle())
        multi_sample = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.option.workdir, "scripts", 'standalone', multi_sample)

        #extend program path
        self.cnv.program = self.expath('cnv.program')

        temp = impl.mkdir(self.option.workdir, 'temp')
        annolist = os.path.join(temp, 'anno_depth.list')
        with open(annolist, 'w') as f:
            if self.option.multiSample:
                for sample in self.sample:
                    anno_region = os.path.join(inputInfo[multi_sample], '%s.anno_region.txt' % sample)
                    line = "%s\t%s\n" % (sample, anno_region)
                    f.write(line)
            else:
                for sampleName in inputInfo:
                    anno_region = os.path.join(inputInfo[sampleName], '%s.anno_region.txt' % sampleName)
                    line = "%s\t%s\n" % (sampleName, anno_region)
                    f.write(line)

        _, output = commands.getstatusoutput('perl %s/bin/require_config.pl %s' % (self.GAEA_HOME, self.file.annoProtoclConfig))
        config = eval(output)
        self.cnv.parameter += ' -trans %s' % config['trans']

        #global param
        ParamDict = {
            "PROGRAM": "perl %s" % self.cnv.program,
            "OUTPUT": impl.mkdir(self.option.workdir, 'variation', 'cnv'),
            "ANNOLIST": annolist,
            "SAMPLELIST": self.option.sampleList
        }

        #script template
        cmd = ["${PROGRAM} -output ${OUTPUT} -QC ${ANNOLIST} -samplelist ${SAMPLELIST} %s" % self.cnv.parameter]

        #write script
        scriptPath = \
            impl.write_shell(
                name='cnv',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict=ParamDict)

        #result
        result.script[multi_sample] = scriptPath
        return result
class ubammerge(Workflow):
    """ ubammerge """
    INIT = bundle(ubammerge=bundle())
    INIT.ubammerge.program = "/hwfssz1/BIGDATA_COMPUTING/software/source/gatk4/gatk"
    INIT.ubammerge.parameter = ''

    def run(self, impl, dependList):
        impl.log.info("step: ubammerge!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())
        sampleName = self.option.multiSampleName
        merge_tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubammerge.bam')

        # extend program path
        self.ubammerge.program = self.expath('ubammerge.program')

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        # cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        # cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)

        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'ubammerge_output')

        bams = []
        for sample_name in inputInfo:
            sample_input = inputInfo[sample_name]
            for dataTag in sample_input:
                bams.append(sample_input[dataTag]['bam'])

        if len(bams) <= 1:
            merge_tmp = bams[0]
        else:
            input_bam_command = ''
            for input_bam in bams:
                input_bam_command += "--UNMAPPED_BAM {} ".format(input_bam)
            cmd.append('%s MergeBamAlignment %s -O %s -R %s' %
                       (self.ubammerge.program, input_bam_command, merge_tmp, self.ref.normal.ref))

        cmd.append('%s fs -mkdir -p %s' % (self.hadoop.bin, hdfs_outputPath))
        cmd.append('%s fs -put %s %s' % (self.hadoop.bin, merge_tmp, hdfs_outputPath))

        # write script
        scriptPath = \
            impl.write_shell(
                name='ubammerge',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict=[])

        # result
        result.output[sampleName] = hdfs_outputPath
        result.script[sampleName] = scriptPath
        return result
def run(self, impl, dependList): impl.log.info("step: genotype!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.genotype.program = self.expath('genotype.program') if not self.option.multiSample: if self.genotype.parameter.find('-noMultiSampleCall') != -1: impl.log.warning("Pipeline is in single sample mode, disable -noMultiSampleCall. (deleted)") self.genotype.parameter = self.genotype.parameter.replace('-noMultiSampleCall','') if self.file.get("regionVariation"): self.genotype.parameter += " -intervals file://%s " % self.file.regionVariation elif self.file.get("region"): self.genotype.parameter += " -intervals file://%s " % self.file.region #global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.genotype.program), "REF": "file://%s" % self.ref.normal.gaeaIndex, "REDUCERNUM":self.hadoop.reducer_num }) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${INPUT}/_*" % fs_cmd.delete ) cmd.append("%s ${OUTDIR}" % fs_cmd.delete ) cmd.append("${PROGRAM} -input ${INPUT} -out ${OUTDIR} -ref ${REF} -reduceNum ${REDUCERNUM} %s" %self.genotype.parameter ) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'genotype_output') result.output[sampleName] = hdfs_outputPath #global param JobParamList.append({ "SAMPLE" : sampleName, "SCRDIR" : scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath }) #write script scriptPath = \ impl.write_scripts( name = 'genotype', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) #result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: realignment!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) #extend program path self.realignment.program = self.expath('realignment.program') if self.option.multiSample: self.realignment.parameter += " --mutiSample" #global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.realignment.program), "REF": "file://%s" % self.ref.normal.gaeaIndex, "REDUCERNUM": self.hadoop.reducer_num }) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTDIR}" % fs_cmd.delete) cmd.append("%s ${INPUT}/_*" % fs_cmd.delete) cmd.append( "${PROGRAM} --align ${INPUT} --out ${OUTDIR} --ref ${REF} --reducer ${REDUCERNUM} %s" % self.realignment.parameter) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'realignment_output') #global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath }) result.output[sampleName] = os.path.join(hdfs_outputPath, 'FixMateResult') #write script scriptPath = \ impl.write_scripts( name = 'realignment', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) #result result.script.update(scriptPath) return result
def parseState(self):
    f = open(self.config, 'r')
    state = bundle()
    try:
        data = f.read()
        state = bundle(clean(json.loads(data)))
    except Exception, e:
        print Exception, "%s, " % self.config, e
def run(self, impl, dependList): impl.log.info("step: baserecal!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.baserecal.bqsr = self.expath('baserecal.bqsr') self.baserecal.printreads = self.expath('baserecal.printreads') if self.option.multiSample: self.baserecal.bqsr_param += " -MutiSample " #global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM_BQSR": "%s jar %s" % (self.hadoop.bin, self.baserecal.bqsr), "PROGRAM_PR": "%s jar %s" % (self.hadoop.bin, self.baserecal.printreads), "REF": "file://%s" % self.ref.normal.gaeaIndex, "REDUCERNUM":self.hadoop.reducer_num }) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${INPUT}/_*" % fs_cmd.delete ) cmd.append("%s ${OUTDIR_BQSR}" % fs_cmd.delete ) cmd.append("${PROGRAM_BQSR} -input ${INPUT} -output ${OUTDIR_BQSR} -ref ${REF} -n ${REDUCERNUM} %s" %self.baserecal.bqsr_param) cmd.append("sleep 10") cmd.append("%s ${OUTDIR_PR}" % fs_cmd.delete ) cmd.append("${PROGRAM_PR} -i ${INPUT} -o ${OUTDIR_PR} -f ${REF} -b ${OUTDIR_BQSR}/result.grp %s" %self.baserecal.printreads_param) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'basequalityrecal_output') #global param JobParamList.append({ "SAMPLE" : sampleName, "SCRDIR" : scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR_BQSR": os.path.join(hdfs_outputPath,"gaeaoutdb"), "OUTDIR_PR": os.path.join(hdfs_outputPath,"printreads") }) result.output[sampleName] = os.path.join(hdfs_outputPath,'printreads','result') #write script scriptPath = \ impl.write_scripts( name = 'baserecal', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) #result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: merge_vcf!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.merge_vcf.program = self.expath('merge_vcf.program') self.merge_vcf.bcftools = self.expath('merge_vcf.bcftools', False) # global param hadoop_parameter = '' if self.hadoop.get('queue'): hadoop_parameter += ' -D mapreduce.job.queuename={} '.format( self.hadoop.queue) ParamDict = { "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.merge_vcf.program), "HADOOPPARAM": hadoop_parameter } JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) outputPath = impl.mkdir(self.option.workdir, "variation", sampleName) result.output[sampleName] = os.path.join( outputPath, "{}.hc.vcf.gz".format(sampleName)) # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "VCF": result.output[sampleName] }) cmd = [ "%s ${INPUT}/_*" % self.fs_cmd.delete, '${PROGRAM} SortVcf ${HADOOPPARAM} -input ${INPUT} -output file://${VCF}\n' ] if self.merge_vcf.bcftools: cmd.append( "%s index %s ${VCF}" % (self.merge_vcf.bcftools, self.merge_vcf.bcftools_param)) # write script scriptPath = \ impl.write_scripts( name='merge_vcf', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) # result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: haplotypeCaller!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.haplotypeCaller.program = self.expath('haplotypeCaller.program') if self.file.get("regionVariation"): self.haplotypeCaller.parameter += " -L %s " % self.file.regionVariation elif self.file.get("region"): self.haplotypeCaller.parameter += " -L %s " % self.file.region # global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": "/home/huangzhibo/java -jar {} -T HaplotypeCaller ".format( self.haplotypeCaller.program), "REF": self.ref.normal.ref }) # script template cmd = [ "${PROGRAM} -I ${INPUT} -o ${OUTDIR} -R ${REF} %s" % self.haplotypeCaller.parameter ] JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.option.workdir, "scripts", 'standalone', sampleName) outputPath = impl.mkdir(self.option.workdir, "variation", 'haplotypeCaller', sampleName) result.output[sampleName] = os.path.join( outputPath, "{}.hc.vcf.gz".format(sampleName)) # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR": result.output[sampleName] }) # write script scriptPath = \ impl.write_scripts( name='haplotypeCaller', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) # result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: evaSNP!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) #extend program path self.evaSNP.program = self.expath('evaSNP.program') if not self.evaSNP.chip_vcf: impl.log.error("No chipVCF value for evaSNP step. please set it.") #script template cmd = [] cmd.append( "${PROGRAM} ${PARAM} -c ${REFVCF} -r ${VCF} ${DBSNP} -o ${OUTPUT}") cmd.append( 'if [ $? -ne 0 ]; then\n\techo "[WARNING] ${SAMPLE} - evaSNP failed." >> %s\n\texit 1\nelse' % self.logfile) cmd.append( '\techo "[INFO ] ${SAMPLE} - evaSNP complete." >> %s\n\texit 1\nfi' % self.logfile) dbsnp = '' if self.evaSNP.dbsnp: dbsnp = "-d %s" % self.evaSNP.dbsnp for sampleName in inputInfo: scriptsdir = impl.mkdir(self.option.workdir, "scripts", 'standalone', sampleName) vcf = inputInfo[sampleName] outdir = impl.mkdir(self.option.workdir, "QC", 'evaVCF', sampleName) output = os.path.join(outdir, 'evaSNP.txt') ParamDict = { "SAMPLE": sampleName, "PROGRAM": "perl %s" % self.evaSNP.program, "REFVCF": self.evaSNP.chip_vcf, "VCF": vcf, "DBSNP": dbsnp, "OUTPUT": output, "PARAM": self.evaSNP.parameter } #write script scriptPath = \ impl.write_shell( name = 'evaSNP', scriptsdir = scriptsdir, commands=cmd, paramDict=ParamDict) #result result.output[sampleName] = output result.script[sampleName] = scriptPath return result
def run(self, impl, dependList):
    '''
    dependList is the list of steps this step depends on. For an analysis-flow entry
    such as ['S', 'HelloWorld', 'bamSort'], dependList == ['bamSort'].

    self.results is a wrapped dictionary (a bundle, so values can also be accessed
    with '.') that stores the output information of every step, for example:

    self.results = \
    {
        "bamSort": {
            "output": {
                "sample1": "/path/sample1.bam",
                "sample2": "/path/sample2.bam"
            },
            "script": {
                "sample1": "/path/sample1/bamSort.sh",
                "sample2": "/path/sample2/bamSort.sh"
            }
        },
        ...
    }

    To get the output of the bamSort step from self.results:
        inputInfo = self.results.bamSort.output
    '''
    impl.log.info("step: HelloWorld!")
    inputInfo = self.results[dependList[0]].output

    # result: the return value, which is assigned to self.results.HelloWorld.
    # 'script' must be set so the job can be submitted; if 'output' is not set,
    # this APP cannot be depended on by later steps.
    result = bundle(output=bundle(), script=bundle())

    #extend program path (get abs path)
    self.HelloWorld.program = self.expath('HelloWorld.program')

    # script template: cmd is a list, each element becomes one line of the shell
    # script; ${XXX} placeholders are replaced with values from ParamDict.
    cmd = []
    cmd.append('%s index ${PARAM} ${INPUT}' % self.HelloWorld.program)
    cmd.append('echo "Hello World!"')

    for sampleName in inputInfo:
        scriptsdir = impl.mkdir(self.scriptsDir, 'standalone', sampleName)

        ParamDict = {
            "INPUT": inputInfo[sampleName],
            "PARAM": self.HelloWorld.parameter
        }

        #write script
        scriptPath = \
            impl.write_shell(
                name='HelloWorld',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict=ParamDict)

        #result
        result.output[sampleName] = inputInfo[sampleName]
        result.script[sampleName] = scriptPath
    return result
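# For illustration only (hypothetical sample names and paths): after run() the
# bundle returned above would look roughly like
#
#   result = {
#       "output": {"sample1": "/path/sample1.bam", "sample2": "/path/sample2.bam"},
#       "script": {"sample1": ".../standalone/sample1/HelloWorld.sh",
#                  "sample2": ".../standalone/sample2/HelloWorld.sh"}
#   }
#
# and becomes self.results.HelloWorld for any step that lists HelloWorld among its
# dependencies.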
def get_SGE_state(jobId):
    sge = bundle()
    f = open(jobId, 'r')
    for line in f:
        line = line.strip()
        field = line.split('\t')
        if not sge.has_key(field[0]):
            sge[field[0]] = bundle()
        sge[field[0]][field[1]] = field[2]
    return sge
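# The state file is expected to be tab-delimited with three columns; a hypothetical
# example (column meanings inferred from the parsing above, not from a spec):
#
#   sample1	alignment	complete
#   sample1	rmdup	running
#   sample2	alignment	failed
#
# which get_SGE_state() would turn into
#   {'sample1': {'alignment': 'complete', 'rmdup': 'running'},
#    'sample2': {'alignment': 'failed'}}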
def run(self, impl, dependList=None): impl.log.info("step: newCnv!") # depend bamQC result = bundle(script=bundle()) multi_sample = self.option.multiSampleName scriptsdir = impl.mkdir(self.option.workdir, "scripts", 'standalone', multi_sample) # extend program path self.newCnv.program = self.expath('newCnv.program') if self.file.has_key('newCnvConfig'): self.file.newCnvConfig = self.expath('file.newCnvConfig') self.newCnv.parameter += " %s" % self.file.newCnvConfig else: raise RuntimeError("newCnv Config file don't exists!") if self.file.has_key('cnvRegions'): self.file.cnvRegions = self.expath('file.cnvRegions') else: raise RuntimeError("file.cnvRegions don't exists!") poolingList = self.getPoolingList() if len(poolingList) == 0: raise RuntimeError("pooling info must be setted for CNV analysis!") cmd = [] for pool in poolingList: cnvscriptsdir = impl.mkdir(self.option.workdir, "variation", 'cnv', pool) script = self.poolingScript(impl, cnvscriptsdir) self.writeSampleList(pool, cnvscriptsdir) cmd.append("cd %s" % cnvscriptsdir) cmd.append("sh %s >%s.o 2>%s.e" % (script, script, script)) cmd.append( 'if [ $? -ne 0 ]; then\n\techo "[WARNING] %s - newCnv failed." >> %s' % (pool, self.logfile)) cmd.append('\texit 1\nelse') cmd.append( '\techo "[INFO ] %s - newCnv complete." >> %s\nfi\n' % (pool, self.logfile)) # write script scriptPath = \ impl.write_shell( name='newCnv', scriptsdir=scriptsdir, commands=cmd ) # result result.script[multi_sample] = scriptPath return result
class rmdup(Workflow):
    """ rmdup """
    INIT = bundle(rmdup=bundle())
    INIT.rmdup.program = "GaeaDuplicateMarker.jar"
    INIT.rmdup.parameter_SE = ' -S '
    INIT.rmdup.parameter = ''

    def run(self, impl, dependList):
        impl.log.info("step: rmdup!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.rmdup.program = self.expath('rmdup.program')

        if self.init.get('isSE'):
            self.rmdup.parameter = self.rmdup.parameter_SE

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append("${PROGRAM} -I ${INPUT} -O ${OUTDIR} -i 1 -R ${REDUCERNUM} ${PARAM}")

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'rmdup_output')

            #global param
            ParamDict = {
                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.rmdup.program),
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath,
                "REDUCERNUM": self.hadoop.reducer_num,
                "PARAM": self.rmdup.parameter
            }

            #write script
            scriptPath = \
                impl.write_shell(
                    name='rmdup',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            #result
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'Mark')
            result.script[sampleName] = scriptPath
        return result
def run(self, impl, dependList): impl.log.info("step: cnv!") # depend bamQC inputInfo = self.results[dependList[0]].output result = bundle(script=bundle()) multi_sample = self.option.multiSampleName scriptsdir = impl.mkdir(self.option.workdir,"scripts",'standalone',multi_sample) #extend program path self.cnv.program = self.expath('cnv.program') temp = impl.mkdir(self.option.workdir,'temp') annolist = os.path.join(temp,'anno_depth.list') with open(annolist,'w') as f: if self.option.multiSample: for sample in self.sample: anno_region = os.path.join(inputInfo[multi_sample],'%s.anno_region.txt' % sample) line = "%s\t%s\n" % (sample,anno_region) f.write(line) else: for sampleName in inputInfo: anno_region = os.path.join(inputInfo[sampleName],'%s.anno_region.txt' % sampleName) line = "%s\t%s\n" % (sampleName,anno_region) f.write(line) _,output = commands.getstatusoutput('perl %s/bin/require_config.pl %s' % (self.GAEA_HOME,self.file.annoProtoclConfig)) config = eval(output) self.cnv.parameter += ' -trans %s' % config['trans'] #global param ParamDict = { "PROGRAM": "perl %s" % self.cnv.program, "OUTPUT" : impl.mkdir(self.option.workdir,'variation','cnv'), "ANNOLIST":annolist, "SAMPLELIST": self.option.sampleList } #script template cmd = ["${PROGRAM} -output ${OUTPUT} -QC ${ANNOLIST} -samplelist ${SAMPLELIST} %s" %self.cnv.parameter] #write script scriptPath = \ impl.write_shell( name = 'cnv', scriptsdir = scriptsdir, commands=cmd, paramDict=ParamDict) #result result.script[multi_sample] = scriptPath return result
def run(self, impl, dependList): impl.log.info("step: BQSRSpark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.BQSRSpark.program = self.expath('BQSRSpark.program') spark_param = self.BQSRSpark.parameter_spark if self.hadoop.get('queue'): spark_param = impl.paramCheck(True, spark_param, '--queue', self.hadoop.queue) # script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTPUT}" % fs_cmd.delete) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete) cmd.append( "${PROGRAM} BQSRPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} %s -- %s" % (self.BQSRSpark.parameter, spark_param)) # global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": self.BQSRSpark.program, "REF": self.ref.normal.ref }) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'BQSRSpark_output') result.output[sampleName] = hdfs_outputPath # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "OUTPUT": hdfs_outputPath, }) scriptPath = \ impl.write_scripts( name='BQSRSpark', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: baserecal_spark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.baserecal_spark.bqsr = self.expath('baserecal_spark.bqsr') if self.option.multiSample: self.baserecal_spark.parameter += " -MutiSample " #global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM_BQSR": "spark-submit --master yarn --num-executors 192 --executor-cores 1 --executor-memory 6g %s -n 2000" % self.baserecal_spark.bqsr, "REF": "file://%s" % self.ref.normal.gaeaIndex }) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${INPUT}/_*" % fs_cmd.delete ) cmd.append("%s ${OUTDIR_BQSR}" % fs_cmd.delete) cmd.append("${PROGRAM_BQSR} -i ${INPUT} -o ${OUTDIR_BQSR} --ref ${REF} %s" %self.baserecal_spark.parameter) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'baserecal_spark_output') #global param JobParamList.append({ "SAMPLE" : sampleName, "SCRDIR" : scriptsdir, "INPUT": inputInfo[sampleName], "OUTDIR_BQSR": hdfs_outputPath }) result.output[sampleName] = hdfs_outputPath #write script scriptPath = \ impl.write_scripts( name = 'baserecal_spark', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) #result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: merge_vcf!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.merge_vcf.program = self.expath('merge_vcf.program') self.merge_vcf.bcftools = self.expath('merge_vcf.bcftools', False) # global param hadoop_parameter = '' if self.hadoop.get('queue'): hadoop_parameter += ' -D mapreduce.job.queuename={} '.format(self.hadoop.queue) ParamDict = { "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.merge_vcf.program), "HADOOPPARAM": hadoop_parameter } JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) outputPath = impl.mkdir(self.option.workdir, "variation", sampleName) result.output[sampleName] = os.path.join(outputPath, "{}.hc.vcf.gz".format(sampleName)) # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "VCF": result.output[sampleName] }) cmd = ["%s ${INPUT}/_*" % self.fs_cmd.delete, '${PROGRAM} SortVcf ${HADOOPPARAM} -input ${INPUT} -output file://${VCF}\n'] if self.merge_vcf.bcftools: cmd.append("%s index %s ${VCF}" % (self.merge_vcf.bcftools, self.merge_vcf.bcftools_param)) # write script scriptPath = \ impl.write_scripts( name='merge_vcf', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) # result result.script.update(scriptPath) return result
class spark_rmdup(Workflow):
    """ spark_rmdup """
    INIT = bundle(spark_rmdup=bundle())
    INIT.spark_rmdup.program = "/ifs4/ISDC_BD/huangzhibo/test/testSpark/20160623/DuplicationMark.jar"
    INIT.spark_rmdup.parameter = ' -i 1 '

    def run(self, impl, dependList):
        impl.log.info("step: spark_rmdup!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.spark_rmdup.program = impl.expath(self.Path.prgDir, self.spark_rmdup.program)

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append("spark-submit --class org.bgi.flexlab.gaea.spark.example.DuplicationMark --master yarn --num-executors 48 --driver-memory 8g --executor-memory 25g --executor-cores 4 --queue spark_queue ${PROGRAM} -I ${INPUT} -O ${OUTDIR} ${PARAM}")

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'spark_rmdup_output')

            #global param
            ParamDict = {
                "PROGRAM": self.spark_rmdup.program,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath,
                "PARAM": self.spark_rmdup.parameter
            }

            #write script
            scriptPath = \
                impl.write_shell(
                    name='spark_rmdup',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            #result
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'Mark')
            result.script[sampleName] = scriptPath
        return result
def parse_sample(sampleList):
    sample_lane_counter = 0
    total_number = 0
    with open(sampleList, 'r') as sampleFile:
        sampleInfo = bundle()
        gender = 'male'
        thetype = ''
        pool = ''
        for line in sampleFile:
            line = line.strip()
            # sample_name, gender, family, type, pool, fq1, fq2, insert_size, fq1s
            field = line.split()
            field_num = len(field)
            sample_name = field[0]
            family = field[0]
            fq1 = field[1]
            fq2 = field[2]

            tmp = os.path.basename(fq1).split("_")
            rg_ID = tmp[0]
            rg_PU = total_number
            rg_LB = total_number
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sample_name)
            fq_lib_name = rg_ID
            total_number += 1

            if sample_name not in sampleInfo:
                sampleInfo[sample_name] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sample_name])

            dataTag = 'data' + str(sample_lane_counter)
            if dataTag not in sampleInfo[sample_name]:
                sampleInfo[sample_name][dataTag] = bundle()

            sampleInfo[sample_name][dataTag]['fq1'] = fq1
            sampleInfo[sample_name][dataTag]['fq2'] = fq2
            sampleInfo[sample_name][dataTag]['adp1'] = 'null'
            sampleInfo[sample_name][dataTag]['adp2'] = 'null'
            sampleInfo[sample_name][dataTag]['gender'] = gender
            sampleInfo[sample_name][dataTag]['family'] = family
            sampleInfo[sample_name][dataTag]['type'] = thetype
            sampleInfo[sample_name][dataTag]['pool'] = pool
            sampleInfo[sample_name][dataTag]['rg'] = rg
            sampleInfo[sample_name][dataTag]['libname'] = fq_lib_name

    return sampleInfo
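# A hypothetical sample-list line (only sample name, fq1 and fq2 are actually read
# here; the other columns named in the comment above are not parsed):
#
#   sampleA    /data/sampleA_L01_1.fq.gz    /data/sampleA_L01_2.fq.gz
#
# would yield roughly
#
#   sampleInfo['sampleA']['data0'] == {
#       'fq1': '/data/sampleA_L01_1.fq.gz', 'fq2': '/data/sampleA_L01_2.fq.gz',
#       'adp1': 'null', 'adp2': 'null', 'gender': 'male', 'family': 'sampleA',
#       'type': '', 'pool': '',
#       'rg': '@RG\\tID:sampleA\\tPL:illumina\\tPU:0\\tLB:0\\tSM:sampleA\\tCN:BGI',
#       'libname': 'sampleA'
#   }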
def run(self, impl, dependList): impl.log.info("step: BQSRSpark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.BQSRSpark.program = self.expath('BQSRSpark.program') spark_param = self.BQSRSpark.parameter_spark if self.hadoop.get('queue'): spark_param = impl.paramCheck(True, spark_param, '--queue', self.hadoop.queue) # script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTPUT}" % fs_cmd.delete) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete) cmd.append("${PROGRAM} BQSRPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} %s -- %s" % ( self.BQSRSpark.parameter, spark_param)) # global param ParamDict = self.file.copy() ParamDict.update({ "PROGRAM": self.BQSRSpark.program, "REF": self.ref.normal.ref }) JobParamList = [] for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'BQSRSpark_output') result.output[sampleName] = hdfs_outputPath # global param JobParamList.append({ "SAMPLE": sampleName, "SCRDIR": scriptsdir, "INPUT": inputInfo[sampleName], "OUTPUT": hdfs_outputPath, }) scriptPath = \ impl.write_scripts( name='BQSRSpark', commands=cmd, JobParamList=JobParamList, paramDict=ParamDict) result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: evaSNP!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.evaSNP.program = self.expath('evaSNP.program') if not self.evaSNP.chip_vcf: impl.log.error("No chipVCF value for evaSNP step. please set it.") #script template cmd = [] cmd.append("${PROGRAM} ${PARAM} -c ${REFVCF} -r ${VCF} ${DBSNP} -o ${OUTPUT}") cmd.append('if [ $? -ne 0 ]; then\n\techo "[WARNING] ${SAMPLE} - evaSNP failed." >> %s\n\texit 1\nelse' %self.logfile) cmd.append('\techo "[INFO ] ${SAMPLE} - evaSNP complete." >> %s\n\texit 1\nfi' % self.logfile) dbsnp = '' if self.evaSNP.dbsnp: dbsnp = "-d %s" % self.evaSNP.dbsnp for sampleName in inputInfo: scriptsdir = impl.mkdir(self.option.workdir,"scripts",'standalone',sampleName) vcf = inputInfo[sampleName] outdir = impl.mkdir(self.option.workdir,"QC",'evaVCF',sampleName) output = os.path.join(outdir,'evaSNP.txt') ParamDict = { "SAMPLE":sampleName, "PROGRAM": "perl %s" % self.evaSNP.program, "REFVCF":self.evaSNP.chip_vcf, "VCF": vcf, "DBSNP" :dbsnp, "OUTPUT": output, "PARAM":self.evaSNP.parameter } #write script scriptPath = \ impl.write_shell( name = 'evaSNP', scriptsdir = scriptsdir, commands=cmd, paramDict=ParamDict) #result result.output[sampleName] = output result.script[sampleName] = scriptPath return result
def run(self, impl, dependList): impl.log.info("step: BwaMarkDupSpark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.BwaMarkDupSpark.program = self.expath('BwaMarkDupSpark.program') spark_param = self.BwaMarkDupSpark.parameter_spark if self.hadoop.get('queue'): spark_param = impl.paramCheck(True, spark_param, '--queue', self.hadoop.queue) # script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTPUT}" % fs_cmd.delete) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete) cmd.append( "${PROGRAM} BwaAndMarkDuplicatesPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} ${PARAM} -- ${PARAMSPARK}" ) for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'BwaMarkDupSpark_output') # global param ParamDict = { "PROGRAM": self.BwaMarkDupSpark.program, "INPUT": inputInfo[sampleName], "OUTPUT": hdfs_outputPath, "REF": self.ref.normal.ref, "PARAM": self.BwaMarkDupSpark.parameter, "PARAMSPARK": spark_param } # write script scriptPath = \ impl.write_shell( name='BwaMarkDupSpark', scriptsdir=scriptsdir, commands=cmd, paramDict=ParamDict) # result result.output[sampleName] = hdfs_outputPath result.script[sampleName] = scriptPath return result
def write_scripts(self, name, commands, JobParamList=[], paramDict={}):
    scriptDict = bundle()
    t = _generate_template(commands)
    if paramDict:
        t = Template(t.safe_substitute(paramDict))

    for d in JobParamList:
        scriptsdir = d.get('SCRDIR')
        sampleName = d.get('SAMPLE')
        if not scriptsdir or not sampleName:
            self.log.error("Error in step (%s) JobParamList (no SAMPLE or SCRDIR)." % name)
            exit(1)

        scriptDict[sampleName] = os.path.join(scriptsdir, name + '.sh')
        script = open(scriptDict[sampleName], 'w')
        print >> script, '#!/bin/bash'
        print >> script, "echo ==========start %s at : `date` ==========" % name
        _script_append(script, t, paramDict=d)
        print >> script, ""
        print >> script, "echo ==========end %s at : `date` ========== " % name
        script.close()
    return scriptDict
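# Illustrative call (hypothetical values), showing how ${...} placeholders are
# resolved: paramDict fills the shared values and each JobParamList entry fills the
# per-sample values, writing <SCRDIR>/<name>.sh:
#
#   impl.write_scripts(
#       name='demo',
#       commands=['${PROGRAM} -i ${INPUT} -o ${OUTDIR}'],
#       JobParamList=[{'SAMPLE': 'sampleA', 'SCRDIR': '/scripts/sampleA',
#                      'INPUT': '/hdfs/sampleA/in', 'OUTDIR': '/hdfs/sampleA/out'}],
#       paramDict={'PROGRAM': 'hadoop jar demo.jar'})
#
# would write /scripts/sampleA/demo.sh containing the expanded command line between
# the start/end echo markers.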
def step(self, name, depends=[], commands=[], hog=False, memory=None,
         arrayParamValues=None, workflowTasksPerGridTask=1, priority=0,
         resources={}, concurrencyLimit=None):
    self._checkname(name)

    depends = wraplist(depends, 'depends')
    for dep in depends:
        if _findstep(dep, self.steps) is None:
            raise RuntimeError('step named %s does not exist' % dep)

    commands = wraplist(commands, 'commands')
    for c in commands:
        if c.strip()[-1] == '&':
            raise RuntimeError('commands must not end with "&"')

    if arrayParamValues is None:
        arrayParamValues = []
    elif 0 == len(arrayParamValues):
        raise RuntimeError('arrayParamValues has 0 elements')

    # Validate and set resource defaults.
    resources = resources.copy()  # shallow copy
    if (memory is not None) and resources.has_key('memorymb'):
        raise RuntimeError('both memory and memorymb resource cannot be specified in a single step')
    if memory is not None:
        resources['memorymb'] = int(memory * 1024)
    if 'cpu' not in resources:
        resources['cpu'] = 100
    if 'memorymb' not in resources:
        resources['memorymb'] = 1024
    memory = float(resources['memorymb']) / 1024

    self.steps.append(bundle(name=name,
                             depends=depends,
                             commands=commands,
                             hog=hog,
                             memory=memory,
                             resources=resources,
                             priority=priority,
                             arrayParamValues=arrayParamValues,
                             workflowTasksPerGridTask=workflowTasksPerGridTask,
                             concurrencyLimit=concurrencyLimit))
    return name
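# Usage sketch (step names, commands and resource values are illustrative, not from
# a real pipeline definition):
#
#   wf.step(name='bamSort',
#           depends='alignment',            # wrapped into a list by wraplist()
#           commands=['sort.sh ${SAMPLE}'],
#           memory=4,                       # GB; stored as resources['memorymb']=4096
#           resources={'cpu': 200},
#           priority=10)
#
# Passing both memory= and resources={'memorymb': ...} raises RuntimeError.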
def write_file(self, fileName, scriptsdir, commands, JobParamList=None, paramDict={}, addShellHeader=False):
    scriptDict = bundle()
    scriptDict.script = []
    t = _generate_template(commands)

    m = re.match('.*\$\{(\S+)\}.*', fileName)
    if JobParamList and m:
        for d in JobParamList:
            if not d.has_key(m.group(1)):
                self.log.error("Wrong impl.write_file parameter 'fileName': no %s in JobParamList." % m.group(1))
            if paramDict:
                d.update(paramDict)
            file_name = _generate_template(fileName).safe_substitute(d)
            scriptFile = os.path.join(scriptsdir, file_name)
            scriptDict["script"].append(scriptFile)
            script = open(scriptFile, 'w')
            print >> script, t.safe_substitute(d)
    else:
        scriptFile = os.path.join(scriptsdir, fileName)
        scriptDict["script"].append(scriptFile)
        script = open(scriptFile, 'w')
        if addShellHeader:
            print >> script, '#!/bin/bash'
            print >> script, "echo ==========start %s at : `date` ==========" % os.path.splitext(fileName)[0]
            _script_append(script, t, JobParamList, paramDict)
            print >> script, "echo ==========end %s at : `date` ========== " % os.path.splitext(fileName)[0]
        else:
            _script_append(script, t, JobParamList, paramDict)
    script.close()
    return scriptDict
def write_scripts(self, name, commands, JobParamList=[], paramDict={}):
    scriptDict = bundle()
    t = _generate_template(commands)

    for d in JobParamList:
        scriptsdir = d.get('SCRDIR')
        sampleName = d.get('SAMPLE')
        if not scriptsdir or not sampleName:
            self.log.error("Error in step (%s) JobParamList (no SAMPLE or SCRDIR)." % name)
            exit(1)
        if paramDict:
            d.update(paramDict)

        scriptDict[sampleName] = os.path.join(scriptsdir, name + '.sh')
        script = open(scriptDict[sampleName], 'w')
        print >>script, '#!/bin/bash'
        print >>script, "echo ==========start %s at : `date` ==========" % name
        _script_append(script, t, paramDict=d)
        print >>script, ""
        print >>script, "echo ==========end %s at : `date` ========== " % name
        script.close()
    return scriptDict
def write_Scripts(self, name, scriptsdir, commands, JobParamList=[], paramDict={}, reducer=True):
    scriptDict = bundle()
    t = _generate_template(commands)
    scriptDict["script"] = []

    for n, d in enumerate(JobParamList):
        if paramDict:
            d.update(paramDict)
        dataTag = str(n)
        if d.get('DATATAG'):
            dataTag = d.get('DATATAG')

        scriptFile = os.path.join(scriptsdir, name + '_' + dataTag + '.sh')
        scriptDict["script"].append(scriptFile)
        script = open(scriptFile, 'w')
        if reducer:
            print >>script, t.safe_substitute(d)
        else:
            print >>script, '#!/bin/bash'
            print >>script, "echo ==========start %s at : `date` ==========" % name
            print >>script, t.safe_substitute(d)
            # print >>script, "\n"
            print >>script, "echo ==========end %s at : `date` ========== " % name
        script.close()
    return scriptDict
def pycommand(self, func, *args, **kwargs):
    '''Creates the command to run the given python function, passing it the
    given arguments. The function (func) must be picklable by the python pickle
    package. Basically, this means it must be a global function within a
    module. The arguments must be JSON-serializable. The func is required to
    accept the arrayParamValues element for this task as its first argument
    (always a dictionary object), and any additional parameters must be passed
    to the pycommand function. For non-array steps, the arrayParamValues
    element is still passed, but is a dictionary with no keys.'''
    funcString = self._escapetriplequote(pickle.dumps(func))
    argBundle = bundle(args=args, kwargs=kwargs)
    argString = self._escapetriplequote(json.dumps(argBundle, indent=2))
    lines = [
        '#! /usr/bin/env python',
        'import sys',
        'sys.path = %s' % str(sys.path),
        'import os, pickle, sys, wfclib.jsonutil',
        'from cgiutils import bundle',
        'func = pickle.loads("""%s""")' % funcString,
        'argBundle = wfclib.jsonutil.loads("""%s""")' % argString,
        'apv = bundle()',
        'apvKeys = os.getenv("CGI_ARRAY_PARAM_NAMES")',
        'if apvKeys is not None:',
        '    for key in apvKeys.split(":"):',
        '        apv[key] = os.getenv(key)',
        'func(apv, *argBundle.args, **argBundle.kwargs)',
    ]
    script = '\n'.join(lines)
    return "python -u - <<'@CGI_PYCOMMAND_HERE_DOC_DELIM'\n%s\n@CGI_PYCOMMAND_HERE_DOC_DELIM" % script
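# Usage sketch (hypothetical module-level function; pycommand itself only builds a
# shell command string, it does not execute anything):
#
#   def count_lines(apv, path):          # must be a global, picklable function
#       print len(open(path).readlines())
#
#   cmd = wf.pycommand(count_lines, '/tmp/input.txt')
#   wf.step(name='count', commands=[cmd])
#
# The returned string is a "python -u - <<'...'" here-doc that unpickles
# count_lines and calls it as count_lines(apv, '/tmp/input.txt'), where apv holds
# any CGI_ARRAY_PARAM_NAMES environment values for array steps.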
def run(self, impl, dependList): impl.log.info("step: ubammerge!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) sampleName = self.option.multiSampleName merge_tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubammerge.bam') # extend program path self.ubammerge.program = self.expath('ubammerge.program') # script template fs_cmd = self.fs_cmd cmd = [] # cmd.append("%s ${OUTDIR}/" % fs_cmd.delete) # cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete) scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'ubammerge_output') bams = [] for sample_name in inputInfo: sample_input = inputInfo[sample_name] for dataTag in sample_input: bams.append(sample_input[dataTag]['bam']) if len(bams) <= 1: merge_tmp = bams[0] else: input_bam_command = '' for input_bam in bams: input_bam_command += "--UNMAPPED_BAM {} ".format(input_bam) cmd.append('%s MergeBamAlignment %s -O %s -R %s' % (self.ubammerge.program, input_bam_command, merge_tmp, self.ref.normal.ref)) cmd.append('%s fs -mkdir -p %s' % (self.hadoop.bin, hdfs_outputPath)) cmd.append('%s fs -put %s %s' % (self.hadoop.bin, merge_tmp, hdfs_outputPath)) # write script scriptPath = \ impl.write_shell( name='ubammerge', scriptsdir=scriptsdir, commands=cmd, paramDict=[]) # result result.output[sampleName] = hdfs_outputPath result.script[sampleName] = scriptPath return result
def service(self, name, serviceCommand, serviceData, concurrencyLimit=None, hog=False):
    if concurrencyLimit is None:
        concurrencyLimit = 1200
    self._checkname(name)
    self.services.append(bundle(name=name,
                                serviceCommand=serviceCommand,
                                serviceData=serviceData,
                                concurrencyLimit=concurrencyLimit,
                                hog=hog))
    return name
def run(self, impl,dependList): impl.log.info("step: rmdup!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.rmdup.program = self.expath('rmdup.program') if self.init.get('isSE'): self.rmdup.parameter = self.rmdup.parameter_SE hadoop_parameter = '' if self.hadoop.get('queue'): hadoop_parameter += '-D mapreduce.job.queuename={}'.format(self.hadoop.queue) #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTDIR}/" % fs_cmd.delete ) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete ) cmd.append("${PROGRAM} %s -i ${INPUT} -o ${OUTDIR} -R ${REDUCERNUM} ${PARAM}" % hadoop_parameter) for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'rmdup_output') #global param ParamDict = { "PROGRAM": "%s jar %s MarkDuplicate" % (self.hadoop.bin, self.rmdup.program), "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath, "REDUCERNUM":self.hadoop.reducer_num, "PARAM":self.rmdup.parameter } #write script scriptPath = \ impl.write_shell( name = 'rmdup', scriptsdir = scriptsdir, commands=cmd, paramDict=ParamDict) #result result.output[sampleName] = os.path.join(hdfs_outputPath,'Mark') result.script[sampleName] = scriptPath return result
def run(self, impl,dependList): impl.log.info("step: clean!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path #self.clean.program = self.expath('clean.program') #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("source %s/bin/activate" % self.GAEA_HOME) cmd.append("check.py -s %s/state.json -n ${SAMPLE} -t %s %s" % (self.stateDir, ','.join(dependList), self.init.check_state_param)) cmd.append("if [ $? = 0 ];then") cmd.append("%s %s/${SAMPLE}" % (fs_cmd.delete, self.option.dirHDFS)) if self.init.check_state_param: cmd.append("${CPVCF}") cmd.append("fi") JobParamList = [] for sampleName in self.sample: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) vcf = '' for step in dependList: vcf_tmp = self.results[step].output[sampleName] if os.path.basename(vcf_tmp).find('vcf') != -1: vcf = vcf_tmp break #global param JobParamList.append({ "SAMPLE" : sampleName, "SCRDIR" : scriptsdir, "CPVCF" : "cp %s /ldfssz1/ST_HEALTH/WGS/project/3000members_hg38/vcf/" % vcf if vcf else '' }) #write script scriptPath = \ impl.write_scripts( name = 'clean', commands=cmd, JobParamList=JobParamList) #result result.script.update(scriptPath) return result
def run(self, impl, dependList): impl.log.info("step: BwaMarkDupSpark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(), script=bundle()) # extend program path self.BwaMarkDupSpark.program = self.expath('BwaMarkDupSpark.program') spark_param = self.BwaMarkDupSpark.parameter_spark if self.hadoop.get('queue'): spark_param = impl.paramCheck(True, spark_param, '--queue', self.hadoop.queue) # script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTPUT}" % fs_cmd.delete) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete) cmd.append("${PROGRAM} BwaAndMarkDuplicatesPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} ${PARAM} -- ${PARAMSPARK}") for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'BwaMarkDupSpark_output') # global param ParamDict = { "PROGRAM": self.BwaMarkDupSpark.program, "INPUT": inputInfo[sampleName], "OUTPUT": hdfs_outputPath, "REF": self.ref.normal.ref, "PARAM": self.BwaMarkDupSpark.parameter, "PARAMSPARK": spark_param } # write script scriptPath = \ impl.write_shell( name='BwaMarkDupSpark', scriptsdir=scriptsdir, commands=cmd, paramDict=ParamDict) # result result.output[sampleName] = hdfs_outputPath result.script[sampleName] = scriptPath return result
def parse_userjson(self, jsonfile):
    userConf = bundle()
    with open(jsonfile, 'r') as uf:
        data = uf.read()
        try:
            userConf = clean(json.loads(data))
            self.extendcfg(userConf)
        except Exception, e:
            print Exception, "%s, " % self.config, e
def run(args, state):
    analysisDict = state.analysisDict
    sampleName = args.sampleName
    logger = Logger(os.path.join(state.scriptsDir, 'log'), '1', 'gaeaJobMonitor', False).getlog()
    isComplete = bundle()
    all_done = True

    jobList = args.jobs.split(',')
    if jobList[0] == 'init':
        if not state.results['init'].get('script'):
            jobList = jobList[1:]

    for num, step in enumerate(jobList):
        if analysisDict[step].platform == 'S':
            continue
        n = state.analysisList.index(step)
        if state.analysisList[0] != 'init':
            n += 1
        script = state.results[step]['script'][sampleName]

        if num > 0:
            for depStep in analysisDict[step].depend:
                if not isComplete[depStep]:
                    isComplete[step] = False
                    break
        if isComplete.has_key(step) and isComplete[step] == False:
            logger.warning('%s - step %d: %s failed' % (sampleName, n, step))
            continue

        printtime('step: %s start...' % step)
        p = subprocess.Popen('sh %s' % script, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        isComplete[step] = check_log(p, script, sampleName, n, step)

        if isComplete[step] or step == 'alignment':
            if step == 'alignment':
                isComplete[step] = True
            printtime("step: %s complete" % step)
            logger.info('%s - step %d: %s complete' % (sampleName, n, step))
            out_fh = open(script + '.o', 'w')
            for line in p.stdout.readlines():
                print >> out_fh, line[:-1]
            p.wait()
        else:
            all_done = False
            printtime("%s failed" % step)
            logger.warning('%s - step %d: %s failed' % (sampleName, n, step))
            if p.returncode == None:
                p.kill()
    return all_done
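# A sketch of the inputs this monitor expects (values hypothetical):
#
#   args.jobs == "init,alignment,rmdup"                        # steps, in run order
#   state.results['rmdup']['script']['sampleA'] == "/scripts/sampleA/rmdup.sh"
#
# Steps whose platform is 'S' (standalone) are skipped here, and a step is only
# launched once every step listed in analysisDict[step].depend has completed.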
def parse(self, user_config=''):
    configInfo = bundle()
    if user_config:
        if user_config.endswith('.json') or user_config.endswith('config.json'):
            configInfo = self.parse_userjson(user_config)
        else:
            try:
                configInfo = self.parse_usercfg(user_config)
            except Exception, e:
                print Exception, "%s, " % user_config, e
def run(args):
    binPath = os.path.split(os.path.realpath(__file__))[0]
    os.environ['GAEA_HOME'] = os.path.split(binPath)[0]
    createVar = locals()
    defaultConfig = os.path.join(os.environ['GAEA_HOME'], 'config', 'default.json')

    usercfg = bundle()
    try:
        usercfg = ParseConfig(defaultConfig).parse(args.config)
    except Exception, e:
        raise RuntimeError("Parse UserConfig failed," + repr(e) + "\n")
def getAnalysisDict(analysis_flow):
    graph = bundle(init=bundle())
    graph['init']['depend'] = []
    graph['init']['platform'] = 'H'

    for stepList in analysis_flow:
        if not graph.has_key(stepList[1]):
            graph[stepList[1]] = bundle()
        graph[stepList[1]]['depS'] = False

        if len(stepList) == 2:
            graph[stepList[1]]['depend'] = ['init']
            graph[stepList[1]]['platform'] = stepList[0].upper()
        else:
            graph[stepList[1]]['depend'] = stepList[2].split(',')
            graph[stepList[1]]['platform'] = stepList[0].upper()
            for dep in graph[stepList[1]]['depend']:
                if graph[dep]['platform'].upper() == 'S':
                    graph[stepList[1]]['depS'] = True
    return graph
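# Illustrative input/output (step names are hypothetical): each entry of
# analysis_flow is [platform, step] or [platform, step, comma-separated-depends]:
#
#   analysis_flow = [['H', 'alignment'],
#                    ['H', 'rmdup', 'alignment'],
#                    ['S', 'cnv', 'rmdup']]
#
#   getAnalysisDict(analysis_flow)['rmdup'] ==
#       {'depS': False, 'depend': ['alignment'], 'platform': 'H'}
#
# 'depS' becomes True when any dependency runs on the standalone ('S') platform.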
def check_ref_type(ref):
    if not ref.has_key('normal'):
        ref.normal = bundle()
    if not ref.has_key('male'):
        ref.male = bundle()
    if not ref.has_key('female'):
        ref.female = bundle()

    if ref.male.get('ref') and ref.female.get('ref'):
        logger.info('male.ref: %s, female:%s. use gender mode!' % (ref.male.ref, ref.female.ref))
        ref['gender_mode'] = 'both'
        ref.normal.rupdate(ref.male)

    if ref.normal.get('ref') and ref.female.get('ref') and not ref.male.get('ref'):
        # logger.warning("male ref don't exists! use normal as male.")
        ref['gender_mode'] = 'both'
        ref.male.rupdate(ref.normal)

    if ref.normal.get('ref') and not ref.female.get('ref') and not ref.male.get('ref'):
        # logger.warning("male and female ref don't exists! use normal mode!")
        ref['gender_mode'] = 'normal'
def write_shell(self, name, scriptsdir, commands, JobParamList=[], paramDict={}):
    scriptDict = bundle()
    t = _generate_template(commands)

    scriptFile = os.path.join(scriptsdir, name + '.sh')
    script = open(scriptFile, 'w')
    print >>script, '#!/bin/bash'
    print >>script, "echo ==========start %s at : `date` ==========" % name
    _script_append(script, t, JobParamList, paramDict)
    print >>script, ""
    print >>script, "echo ==========end %s at : `date` ========== " % name
    script.close()
    return scriptFile
def run(self, impl,dependList): impl.log.info("step: rmdup_spark!") inputInfo = self.results[dependList[0]].output result = bundle(output=bundle(),script=bundle()) #extend program path self.rmdup_spark.program = self.expath('rmdup_spark.program') #script template fs_cmd = self.fs_cmd cmd = [] cmd.append("%s ${OUTDIR}/" % fs_cmd.delete ) cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete ) cmd.append("spark-submit --class org.bgi.flexlab.gaea.spark.example.DuplicationMark --master yarn --num-executors 48 --driver-memory 8g --executor-memory 25g --executor-cores 4 --queue spark_queue ${PROGRAM} -I ${INPUT} -O ${OUTDIR} ${PARAM}") for sampleName in inputInfo: scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName) hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'rmdup_spark_output') #global param ParamDict = { "PROGRAM":self.rmdup_spark.program, "INPUT": inputInfo[sampleName], "OUTDIR": hdfs_outputPath, "PARAM":self.rmdup_spark.parameter } #write script scriptPath = \ impl.write_shell( name = 'rmdup_spark', scriptsdir = scriptsdir, commands=cmd, paramDict=ParamDict) #result result.output[sampleName] = os.path.join(hdfs_outputPath,'Mark') result.script[sampleName] = scriptPath return result
def bundle_rcopy(cfg):
    newdict = bundle()
    for entry in cfg:
        this_entry = cfg[entry]
        if isinstance(this_entry, dict):
            this_entry = bundle_rcopy(this_entry)
        elif isinstance(this_entry, list):
            # create a copy rather than a reference
            this_entry = list(this_entry)
        elif isinstance(this_entry, tuple):
            # create a copy rather than a reference
            this_entry = tuple(this_entry)
        newdict[entry] = this_entry
    return newdict
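# Minimal sketch of the intent (hypothetical values): nested dicts become fresh
# bundles and lists are shallow-copied, so mutating the copy leaves the original intact:
#
#   cfg = bundle(hadoop=bundle(queue='default'), steps=['alignment', 'rmdup'])
#   cp = bundle_rcopy(cfg)
#   cp.hadoop.queue = 'spark_queue'
#   cp.steps.append('cnv')
#   # cfg.hadoop.queue is still 'default' and cfg.steps still has two entries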