Example #1
    def run(self, impl, dependList):
        impl.log.info("step: jointcalling!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        hadoop_parameter = ''
        if self.hadoop.get('queue'):
            hadoop_parameter += ' -D mapreduce.job.queuename={} '.format(
                self.hadoop.queue)

        # extend program path
        self.jointcalling.program = self.expath('jointcalling.program')

        # global param
        ParamDict = self.file.copy()
        ParamDict.update({
            "PROGRAM":
            "%s jar %s JointCalling %s" %
            (self.hadoop.bin, self.jointcalling.program, hadoop_parameter),
            "REF":
            "file://%s" % self.ref.normal.gaeaIndex,
            "REDUCERNUM":
            self.hadoop.reducer_num
        })

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${INPUT}/_*" % fs_cmd.delete)
        cmd.append("%s ${OUTDIR}" % fs_cmd.delete)
        cmd.append(
            "${PROGRAM} -i ${INPUT} -o ${OUTDIR} -r ${REF} -n ${REDUCERNUM} %s"
            % self.jointcalling.parameter)

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                           'jointcalling_output')
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'vcf')

            # global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath
            })

        # write script
        scriptPath = \
            impl.write_scripts(
                name='jointcalling',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        # result
        result.script.update(scriptPath)
        return result
Example #2
class clean(Workflow):
    """ clean """

    INIT = bundle(clean=bundle())
    INIT.clean.program = "/szhwfs1/ST_HEALTH/GENOME_APP/F16ZQSB1SY2582/personalgenome/lib/genome_api_for_gaea.pl"
    INIT.clean.parameter = ''

    def run(self, impl, dependList):
        impl.log.info("step: clean!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        #self.clean.program = self.expath('clean.program')

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("source %s/bin/activate" % self.GAEA_HOME)
        cmd.append(
            "check.py -s %s/state.json -n ${SAMPLE} -t %s %s" %
            (self.stateDir, ','.join(dependList), self.init.check_state_param))
        cmd.append("if [ $? = 0 ];then")
        cmd.append("%s %s/${SAMPLE}" % (fs_cmd.delete, self.option.dirHDFS))
        if self.init.check_state_param:
            cmd.append("${CPVCF}")
        cmd.append("fi")

        JobParamList = []
        for sampleName in self.sample:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            vcf = ''
            for step in dependList:
                vcf_tmp = self.results[step].output[sampleName]
                if os.path.basename(vcf_tmp).find('vcf') != -1:
                    vcf = vcf_tmp
                    break

            #global param
            JobParamList.append({
                "SAMPLE":
                sampleName,
                "SCRDIR":
                scriptsdir,
                "CPVCF":
                "cp %s /ldfssz1/ST_HEALTH/WGS/project/3000members_hg38/vcf/" %
                vcf if vcf else ''
            })

        #write script
        scriptPath = \
        impl.write_scripts(
                name = 'clean',
                commands=cmd,
                JobParamList=JobParamList)

        #result
        result.script.update(scriptPath)
        return result
Example #3
    def run(self, impl, dependList):
        impl.log.info("step: realignment!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(),script=bundle())
        
        #extend program path
        self.realignment.program = self.expath('realignment.program')
        
        if self.option.multiSample:
            self.realignment.parameter += " --multiSample"

        fs_type = 'file://'
        if self.hadoop.input_format == 'hdfs':
            fs_type = ''

        #global param
        ParamDict = self.file.copy()
        ParamDict.update({
                "PROGRAM": "%s jar %s Realigner" % (self.hadoop.bin, self.realignment.program),
                "REF": "file://%s" % self.ref.normal.gaeaIndex,
                "REDUCERNUM":self.hadoop.reducer_num
            })
            
        hadoop_parameter = ''
        if self.hadoop.get('queue'):
            hadoop_parameter += ' -D mapreduce.job.queuename={} '.format(self.hadoop.queue)
        #script template    
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}" % fs_cmd.delete )
        cmd.append("%s ${INPUT}/_*" % fs_cmd.delete )
        cmd.append("${PROGRAM} %s -i ${INPUT} -o ${OUTDIR} -r ${REF} -n ${REDUCERNUM} %s" % (hadoop_parameter, self.realignment.parameter))
        
        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'realignment_output')
            
            #global param
            JobParamList.append({
                    "SAMPLE" : sampleName,
                    "SCRDIR" : scriptsdir,
                    "INPUT": inputInfo[sampleName],
                    "OUTDIR": hdfs_outputPath
                })
                    
            result.output[sampleName] = os.path.join(hdfs_outputPath,'fixmate')
            
        #write script
        scriptPath = \
        impl.write_scripts(
                name = 'realignment',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)
    
        #result
        result.script.update(scriptPath) 
        return result
Example #4
class baserecal_spark(Workflow):
    """ baserecal_spark """

    INIT = bundle(baserecal_spark=bundle())
    INIT.baserecal_spark.bqsr = "/ifs4/ISDC_BD/huweipeng/project/BQSR/GaeaRecalibrationSpark.jar"
    INIT.baserecal_spark.parameter = "-v file:///ifs4/ISDC_BD/GaeaProject/resource/dbsnp_135.hg19.modify.vcf"

    def run(self, impl, dependList):
        impl.log.info("step: baserecal_spark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(),script=bundle())
        
        #extend program path
        self.baserecal_spark.bqsr = self.expath('baserecal_spark.bqsr')

        if self.option.multiSample:
            self.baserecal_spark.parameter += " -MutiSample "
            
        #global param
        ParamDict = self.file.copy()
        ParamDict.update({
                "PROGRAM_BQSR": "spark-submit --master yarn --num-executors 192 --executor-cores 1 --executor-memory 6g %s -n 2000" % self.baserecal_spark.bqsr,
                "REF": "file://%s" % self.ref.normal.gaeaIndex
            })
        
        #script template    
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${INPUT}/_*" % fs_cmd.delete )
        cmd.append("%s ${OUTDIR_BQSR}" % fs_cmd.delete)
        cmd.append("${PROGRAM_BQSR} -i ${INPUT} -o ${OUTDIR_BQSR} --ref ${REF} %s" %self.baserecal_spark.parameter)

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'baserecal_spark_output')
            
            #global param
            JobParamList.append({
                    "SAMPLE" : sampleName,
                    "SCRDIR" : scriptsdir,
                    "INPUT": inputInfo[sampleName],
                    "OUTDIR_BQSR": hdfs_outputPath
                })
            
            result.output[sampleName] = hdfs_outputPath
            
        #write script
        scriptPath = \
        impl.write_scripts(
                name = 'baserecal_spark',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)
    
        #result
        result.script.update(scriptPath) 
        return result
                                
Example #5
class cnv(Workflow):
    """ cnv """

    INIT = bundle(cnv=bundle())
    INIT.cnv.program = ""
    INIT.cnv.parameter = ""

    def run(self, impl, dependList):
        impl.log.info("step: cnv!")
        # depend bamQC
        inputInfo = self.results[dependList[0]].output
        result = bundle(script=bundle())
        
        multi_sample = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.option.workdir,"scripts",'standalone',multi_sample)
        
        #extend program path
        self.cnv.program = self.expath('cnv.program')
                
        temp = impl.mkdir(self.option.workdir,'temp') 
        annolist = os.path.join(temp,'anno_depth.list')
        with open(annolist,'w') as f:
            if self.option.multiSample:
                for sample in self.sample:
                    anno_region = os.path.join(inputInfo[multi_sample],'%s.anno_region.txt' % sample)
                    line = "%s\t%s\n" % (sample,anno_region)
                    f.write(line)
            else:
                for sampleName in inputInfo:
                    anno_region = os.path.join(inputInfo[sampleName],'%s.anno_region.txt' % sampleName)
                    line = "%s\t%s\n" % (sampleName,anno_region)
                    f.write(line)
                    
        _,output =  commands.getstatusoutput('perl %s/bin/require_config.pl %s' % (self.GAEA_HOME,self.file.annoProtoclConfig))
        config = eval(output)
        self.cnv.parameter += ' -trans %s' % config['trans']   
        
        #global param
        ParamDict = {
                "PROGRAM": "perl %s" % self.cnv.program,
                "OUTPUT" : impl.mkdir(self.option.workdir,'variation','cnv'),
                "ANNOLIST":annolist,
                "SAMPLELIST": self.option.sampleList
            }
        
        #script template    
        cmd = ["${PROGRAM} -output ${OUTPUT} -QC ${ANNOLIST} -samplelist ${SAMPLELIST}  %s" %self.cnv.parameter]
    
        #write script
        scriptPath = \
        impl.write_shell(
                name = 'cnv',
                scriptsdir = scriptsdir,
                commands=cmd,
                paramDict=ParamDict)
    
        #result
        result.script[multi_sample] = scriptPath    
        return result
Example #6
class ubammerge(Workflow):
    """ ubammerge """

    INIT = bundle(ubammerge=bundle())
    INIT.ubammerge.program = "/hwfssz1/BIGDATA_COMPUTING/software/source/gatk4/gatk"
    INIT.ubammerge.parameter = ''

    def run(self, impl, dependList):
        impl.log.info("step: ubammerge!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())
        sampleName = self.option.multiSampleName
        merge_tmp = impl.mkdir(self.option.workdir, "temp", sampleName,
                               'ubammerge.bam')

        # extend program path
        self.ubammerge.program = self.expath('ubammerge.program')

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        # cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        # cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)

        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                       'ubammerge_output')
        bams = []
        for sample_name in inputInfo:
            sample_input = inputInfo[sample_name]
            for dataTag in sample_input:
                bams.append(sample_input[dataTag]['bam'])

        if len(bams) <= 1:
            merge_tmp = bams[0]
        else:
            input_bam_command = ''
            for input_bam in bams:
                input_bam_command += "--UNMAPPED_BAM {} ".format(input_bam)
            cmd.append('%s MergeBamAlignment %s -O %s -R %s' %
                       (self.ubammerge.program, input_bam_command, merge_tmp,
                        self.ref.normal.ref))
        cmd.append('%s fs -mkdir -p %s' % (self.hadoop.bin, hdfs_outputPath))
        cmd.append('%s fs -put %s %s' %
                   (self.hadoop.bin, merge_tmp, hdfs_outputPath))

        # write script
        scriptPath = \
            impl.write_shell(
                name='ubammerge',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict=[])

        # result
        result.output[sampleName] = hdfs_outputPath
        result.script[sampleName] = scriptPath

        return result
Example #7
 def run(self, impl, dependList):
     impl.log.info("step: genotype!")
     inputInfo = self.results[dependList[0]].output
     result = bundle(output=bundle(),script=bundle())
     
     #extend program path
     self.genotype.program = self.expath('genotype.program')
     
     if not self.option.multiSample:
         if self.genotype.parameter.find('-noMultiSampleCall') != -1:
             impl.log.warning("Pipeline is in single sample mode, disable -noMultiSampleCall. (deleted)")
             self.genotype.parameter = self.genotype.parameter.replace('-noMultiSampleCall','')
             
     if self.file.get("regionVariation"):
         self.genotype.parameter += " -intervals file://%s " % self.file.regionVariation
     elif self.file.get("region"):
         self.genotype.parameter += " -intervals file://%s " % self.file.region
         
     #global param
     ParamDict = self.file.copy()
     ParamDict.update({
             "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.genotype.program),
             "REF": "file://%s" % self.ref.normal.gaeaIndex,
             "REDUCERNUM":self.hadoop.reducer_num
         })
     
     #script template    
     fs_cmd = self.fs_cmd
     cmd = []
     cmd.append("%s ${INPUT}/_*" % fs_cmd.delete )
     cmd.append("%s ${OUTDIR}" % fs_cmd.delete )
     cmd.append("${PROGRAM} -input ${INPUT} -out ${OUTDIR} -ref ${REF} -reduceNum ${REDUCERNUM} %s" %self.genotype.parameter )
     
     JobParamList = []
     for sampleName in inputInfo:
         scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
         hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'genotype_output')
         result.output[sampleName] = hdfs_outputPath
         
         #global param
         JobParamList.append({
                 "SAMPLE" : sampleName,
                 "SCRDIR" : scriptsdir,
                 "INPUT": inputInfo[sampleName],
                 "OUTDIR": hdfs_outputPath
             })
         
 
     #write script
     scriptPath = \
     impl.write_scripts(
             name = 'genotype',
             commands=cmd,
             JobParamList=JobParamList,
             paramDict=ParamDict)
 
     #result
     result.script.update(scriptPath)           
     return result
Example #8
    def run(self, impl, dependList):
        impl.log.info("step: realignment!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.realignment.program = self.expath('realignment.program')

        if self.option.multiSample:
            self.realignment.parameter += " --mutiSample"

        #global param
        ParamDict = self.file.copy()
        ParamDict.update({
            "PROGRAM":
            "%s jar %s" % (self.hadoop.bin, self.realignment.program),
            "REF":
            "file://%s" % self.ref.normal.gaeaIndex,
            "REDUCERNUM":
            self.hadoop.reducer_num
        })

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/_*" % fs_cmd.delete)
        cmd.append(
            "${PROGRAM} --align ${INPUT} --out ${OUTDIR} --ref ${REF} --reducer ${REDUCERNUM} %s"
            % self.realignment.parameter)

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                           'realignment_output')

            #global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath
            })

            result.output[sampleName] = os.path.join(hdfs_outputPath,
                                                     'FixMateResult')

        #write script
        scriptPath = \
        impl.write_scripts(
                name = 'realignment',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        #result
        result.script.update(scriptPath)
        return result
Example #9
 def parseState(self):
     state = bundle()
     try:
         f = open(self.config, 'r')
         data = f.read()
         f.close()
         state = bundle(clean(json.loads(data)))
     except Exception, e:
         print "failed to parse state file %s: %s" % (self.config, e)
     return state
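The clean helper used above is not shown on this page. Since this codebase is Python 2 (print statements, "except Exception, e" syntax), json.loads returns unicode strings, and clean most plausibly converts them back to plain str; the sketch below is that assumed behaviour, not the project's actual code:

def clean(obj):
    # Assumed helper: recursively turn the unicode keys/values produced by
    # json.loads into plain Python 2 str objects.
    if isinstance(obj, dict):
        return dict((clean(k), clean(v)) for k, v in obj.items())
    if isinstance(obj, list):
        return [clean(x) for x in obj]
    if isinstance(obj, unicode):
        return obj.encode('utf-8')
    return obj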
Example #10
 def run(self, impl, dependList):
     impl.log.info("step: baserecal!")
     inputInfo = self.results[dependList[0]].output
     result = bundle(output=bundle(),script=bundle())
     
     #extend program path
     self.baserecal.bqsr = self.expath('baserecal.bqsr')
     self.baserecal.printreads = self.expath('baserecal.printreads')
     
     if self.option.multiSample:
         self.baserecal.bqsr_param += " -MutiSample "
         
     #global param
     ParamDict = self.file.copy()
     ParamDict.update({
             "PROGRAM_BQSR": "%s jar %s" % (self.hadoop.bin, self.baserecal.bqsr),
             "PROGRAM_PR": "%s jar %s" % (self.hadoop.bin, self.baserecal.printreads),
             "REF": "file://%s" % self.ref.normal.gaeaIndex,
             "REDUCERNUM":self.hadoop.reducer_num
         })
     
     #script template    
     fs_cmd = self.fs_cmd
     cmd = []
     cmd.append("%s ${INPUT}/_*" % fs_cmd.delete )
     cmd.append("%s ${OUTDIR_BQSR}" % fs_cmd.delete )
     cmd.append("${PROGRAM_BQSR} -input ${INPUT} -output ${OUTDIR_BQSR} -ref ${REF} -n ${REDUCERNUM} %s" %self.baserecal.bqsr_param)
     cmd.append("sleep 10")
     cmd.append("%s ${OUTDIR_PR}" % fs_cmd.delete )
     cmd.append("${PROGRAM_PR} -i ${INPUT} -o ${OUTDIR_PR} -f ${REF} -b ${OUTDIR_BQSR}/result.grp %s" %self.baserecal.printreads_param)
     
     JobParamList = []
     for sampleName in inputInfo:
         scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
         hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'basequalityrecal_output')
         
         #global param
         JobParamList.append({
                 "SAMPLE" : sampleName,
                 "SCRDIR" : scriptsdir,
                 "INPUT": inputInfo[sampleName],
                 "OUTDIR_BQSR": os.path.join(hdfs_outputPath,"gaeaoutdb"),
                 "OUTDIR_PR": os.path.join(hdfs_outputPath,"printreads")
             })
         
         result.output[sampleName] = os.path.join(hdfs_outputPath,'printreads','result')
         
     #write script
     scriptPath = \
     impl.write_scripts(
             name = 'baserecal',
             commands=cmd,
             JobParamList=JobParamList,
             paramDict=ParamDict)
 
     #result
     result.script.update(scriptPath) 
     return result
Example #11
    def run(self, impl, dependList):
        impl.log.info("step: merge_vcf!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.merge_vcf.program = self.expath('merge_vcf.program')
        self.merge_vcf.bcftools = self.expath('merge_vcf.bcftools', False)

        # global param
        hadoop_parameter = ''
        if self.hadoop.get('queue'):
            hadoop_parameter += ' -D mapreduce.job.queuename={} '.format(
                self.hadoop.queue)

        ParamDict = {
            "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.merge_vcf.program),
            "HADOOPPARAM": hadoop_parameter
        }

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            outputPath = impl.mkdir(self.option.workdir, "variation",
                                    sampleName)
            result.output[sampleName] = os.path.join(
                outputPath, "{}.hc.vcf.gz".format(sampleName))

            # global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "VCF": result.output[sampleName]
            })

        cmd = [
            "%s ${INPUT}/_*" % self.fs_cmd.delete,
            '${PROGRAM} SortVcf ${HADOOPPARAM} -input ${INPUT} -output file://${VCF}\n'
        ]

        if self.merge_vcf.bcftools:
            cmd.append(
                "%s index %s ${VCF}" %
                (self.merge_vcf.bcftools, self.merge_vcf.bcftools_param))

        # write script
        scriptPath = \
            impl.write_scripts(
                name='merge_vcf',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        # result
        result.script.update(scriptPath)
        return result
Example #12
    def run(self, impl, dependList):
        impl.log.info("step: haplotypeCaller!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.haplotypeCaller.program = self.expath('haplotypeCaller.program')

        if self.file.get("regionVariation"):
            self.haplotypeCaller.parameter += " -L %s " % self.file.regionVariation
        elif self.file.get("region"):
            self.haplotypeCaller.parameter += " -L %s " % self.file.region

        # global param
        ParamDict = self.file.copy()
        ParamDict.update({
            "PROGRAM":
            "/home/huangzhibo/java -jar {} -T HaplotypeCaller ".format(
                self.haplotypeCaller.program),
            "REF":
            self.ref.normal.ref
        })

        # script template
        cmd = [
            "${PROGRAM} -I ${INPUT} -o ${OUTDIR} -R ${REF} %s" %
            self.haplotypeCaller.parameter
        ]

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.option.workdir, "scripts",
                                    'standalone', sampleName)
            outputPath = impl.mkdir(self.option.workdir, "variation",
                                    'haplotypeCaller', sampleName)
            result.output[sampleName] = os.path.join(
                outputPath, "{}.hc.vcf.gz".format(sampleName))

            # global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": result.output[sampleName]
            })

        # write script
        scriptPath = \
            impl.write_scripts(
                name='haplotypeCaller',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        # result
        result.script.update(scriptPath)
        return result
Example #13
    def run(self, impl, dependList):
        impl.log.info("step: evaSNP!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.evaSNP.program = self.expath('evaSNP.program')

        if not self.evaSNP.chip_vcf:
            impl.log.error("No chipVCF value for evaSNP step. please set it.")

        #script template
        cmd = []
        cmd.append(
            "${PROGRAM} ${PARAM} -c ${REFVCF} -r ${VCF} ${DBSNP} -o ${OUTPUT}")
        cmd.append(
            'if [ $? -ne 0 ]; then\n\techo "[WARNING]  ${SAMPLE} - evaSNP failed." >> %s\n\texit 1\nelse'
            % self.logfile)
        cmd.append(
            '\techo "[INFO   ]  ${SAMPLE} - evaSNP complete." >> %s\nfi'
            % self.logfile)

        dbsnp = ''
        if self.evaSNP.dbsnp:
            dbsnp = "-d %s" % self.evaSNP.dbsnp

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.option.workdir, "scripts",
                                    'standalone', sampleName)
            vcf = inputInfo[sampleName]
            outdir = impl.mkdir(self.option.workdir, "QC", 'evaVCF',
                                sampleName)
            output = os.path.join(outdir, 'evaSNP.txt')
            ParamDict = {
                "SAMPLE": sampleName,
                "PROGRAM": "perl %s" % self.evaSNP.program,
                "REFVCF": self.evaSNP.chip_vcf,
                "VCF": vcf,
                "DBSNP": dbsnp,
                "OUTPUT": output,
                "PARAM": self.evaSNP.parameter
            }

            #write script
            scriptPath = \
            impl.write_shell(
                    name = 'evaSNP',
                    scriptsdir = scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            #result
            result.output[sampleName] = output
            result.script[sampleName] = scriptPath

        return result
Example #14
 def run(self, impl, dependList):
     '''
     dependList is the list of steps this step depends on; e.g. for the chain ['S', 'HelloWorld', 'bamSort'], dependList == ['bamSort'].
     self.results is a wrapped dict type (bundle, whose values can also be read with '.') that stores each step's output info, like so:
     self.results = \
     {
         "bamSort": {
             "output": {
                 "sample1": "/path/sample1.bam", 
                 "sample2": "/path/sample2.bam"
             }, 
             "script": {
                 "sample1": "/path/sample1/bamSort.sh", 
                 "sample2": "/path/sample2/bamSort.sh"
             }
         },
         ...
     }
     To read the bamSort step's output from self.results: inputInfo = self.results.bamSort.output
     '''
     
     impl.log.info("step: HelloWorld!")
     inputInfo = self.results[dependList[0]].output
     
     #result: the return value, assigned to self.results.HelloWorld; script must be set so the job can be submitted, and if output is not set this APP cannot be depended on
     result = bundle(output=bundle(),script=bundle()) 
     
     #extend program path (get abs path)
     self.HelloWorld.program = self.expath('HelloWorld.program')
     
     #script template: cmd is a list; each element becomes one line of the generated shell script, and ${XXX} placeholders are replaced with values from ParamDict
     cmd = []
     cmd.append('%s index ${PARAM} ${INPUT}' % self.HelloWorld.program)
     cmd.append('echo "Hello World!"')
         
     for sampleName in inputInfo:
         scriptsdir = impl.mkdir(self.scriptsDir,'standalone',sampleName)
         
         ParamDict = {
                 "INPUT": inputInfo[sampleName],
                 "PARAM":self.HelloWorld.parameter
             }
         
         #write script
         scriptPath = \
         impl.write_shell(
                 name = 'HelloWorld',
                 scriptsdir = scriptsdir,
                 commands=cmd,
                 paramDict=ParamDict)
         
         #result
         result.output[sampleName] = inputInfo[sampleName]
         result.script[sampleName] = scriptPath
     return result
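bundle itself comes from cgiutils (see the import in Example #34) but is not defined on this page. The docstring above and its usage throughout these examples (attribute reads such as self.results.bamSort.output alongside plain dict indexing) imply a dict with attribute access; a minimal sketch of that behaviour, which the real class may well extend:

class bundle(dict):
    # Minimal sketch: a dict whose keys can also be read and written as attributes.
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

result = bundle(output=bundle(), script=bundle())
result.output['sample1'] = '/path/sample1.bam'
assert result['output'].sample1 == '/path/sample1.bam'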
                             
Example #15
def get_SGE_state(jobId):
    # parse a tab-separated state file: outer key, inner key, value
    sge = bundle()
    with open(jobId, 'r') as f:
        for line in f:
            field = line.strip().split('\t')
            if field[0] not in sge:
                sge[field[0]] = bundle()
            sge[field[0]][field[1]] = field[2]
    return sge
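An illustrative round trip for get_SGE_state, assuming a tab-separated file whose three columns (outer key, inner key, value) are inferred from how the result is indexed rather than documented here:

with open('state.txt', 'w') as f:
    f.write('bamSort\tsample1\tdone\n')
    f.write('bamSort\tsample2\tfailed\n')

sge = get_SGE_state('state.txt')
print(sge['bamSort']['sample2'])   # -> failed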
Example #18
    def run(self, impl, dependList=None):
        impl.log.info("step: newCnv!")
        # depend bamQC
        result = bundle(script=bundle())

        multi_sample = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.option.workdir, "scripts", 'standalone',
                                multi_sample)

        # extend program path
        self.newCnv.program = self.expath('newCnv.program')
        if 'newCnvConfig' in self.file:
            self.file.newCnvConfig = self.expath('file.newCnvConfig')
            self.newCnv.parameter += " %s" % self.file.newCnvConfig
        else:
            raise RuntimeError("newCnv config file doesn't exist!")

        if 'cnvRegions' in self.file:
            self.file.cnvRegions = self.expath('file.cnvRegions')
        else:
            raise RuntimeError("file.cnvRegions doesn't exist!")

        poolingList = self.getPoolingList()
        if len(poolingList) == 0:
            raise RuntimeError("pooling info must be set for CNV analysis!")

        cmd = []
        for pool in poolingList:
            cnvscriptsdir = impl.mkdir(self.option.workdir, "variation", 'cnv',
                                       pool)
            script = self.poolingScript(impl, cnvscriptsdir)
            self.writeSampleList(pool, cnvscriptsdir)
            cmd.append("cd %s" % cnvscriptsdir)
            cmd.append("sh %s >%s.o 2>%s.e" % (script, script, script))
            cmd.append(
                'if [ $? -ne 0 ]; then\n\techo "[WARNING]  %s - newCnv failed." >> %s'
                % (pool, self.logfile))
            cmd.append('\texit 1\nelse')
            cmd.append(
                '\techo "[INFO   ]  %s - newCnv complete." >> %s\nfi\n' %
                (pool, self.logfile))

        # write script
        scriptPath = \
            impl.write_shell(
                name='newCnv',
                scriptsdir=scriptsdir,
                commands=cmd
            )

        # result
        result.script[multi_sample] = scriptPath
        return result
Example #19
class rmdup(Workflow):
    """ rmdup """

    INIT = bundle(rmdup=bundle())
    INIT.rmdup.program = "GaeaDuplicateMarker.jar"
    INIT.rmdup.parameter_SE = ' -S '
    INIT.rmdup.parameter = ''

    def run(self, impl,dependList):
        impl.log.info("step: rmdup!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(),script=bundle())
        
        #extend program path
        self.rmdup.program = self.expath('rmdup.program')
        
        if self.init.get('isSE'):
            self.rmdup.parameter = self.rmdup.parameter_SE
            
        #script template    
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete )
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete )
        cmd.append("${PROGRAM} -I ${INPUT} -O ${OUTDIR} -i 1 -R ${REDUCERNUM} ${PARAM}")
            
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS,sampleName,'rmdup_output')
            
            #global param
            ParamDict = {
                    "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.rmdup.program),
                    "INPUT": inputInfo[sampleName],
                    "OUTDIR": hdfs_outputPath,
                    "REDUCERNUM":self.hadoop.reducer_num,
                    "PARAM":self.rmdup.parameter
                }
            
            #write script
            scriptPath = \
            impl.write_shell(
                    name = 'rmdup',
                    scriptsdir = scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            
            #result
            result.output[sampleName] = os.path.join(hdfs_outputPath,'Mark')
            result.script[sampleName] = scriptPath
        return result
                                
Example #21
    def run(self, impl, dependList):
        impl.log.info("step: BQSRSpark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.BQSRSpark.program = self.expath('BQSRSpark.program')
        spark_param = self.BQSRSpark.parameter_spark
        if self.hadoop.get('queue'):
            spark_param = impl.paramCheck(True, spark_param, '--queue',
                                          self.hadoop.queue)

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTPUT}" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append(
            "${PROGRAM} BQSRPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} %s -- %s"
            % (self.BQSRSpark.parameter, spark_param))

        # global param
        ParamDict = self.file.copy()
        ParamDict.update({
            "PROGRAM": self.BQSRSpark.program,
            "REF": self.ref.normal.ref
        })

        JobParamList = []
        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                           'BQSRSpark_output')
            result.output[sampleName] = hdfs_outputPath

            # global param
            JobParamList.append({
                "SAMPLE": sampleName,
                "SCRDIR": scriptsdir,
                "INPUT": inputInfo[sampleName],
                "OUTPUT": hdfs_outputPath,
            })

        scriptPath = \
            impl.write_scripts(
                name='BQSRSpark',
                commands=cmd,
                JobParamList=JobParamList,
                paramDict=ParamDict)

        result.script.update(scriptPath)
        return result
Example #24
class spark_rmdup(Workflow):
    """ spark_rmdup """

    INIT = bundle(spark_rmdup=bundle())
    INIT.spark_rmdup.program = "/ifs4/ISDC_BD/huangzhibo/test/testSpark/20160623/DuplicationMark.jar"
    INIT.spark_rmdup.parameter = ' -i 1 '

    def run(self, impl, dependList):
        impl.log.info("step: spark_rmdup!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        #extend program path
        self.spark_rmdup.program = impl.expath(self.Path.prgDir,
                                               self.spark_rmdup.program)

        #script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append(
            "spark-submit --class org.bgi.flexlab.gaea.spark.example.DuplicationMark --master yarn --num-executors 48 --driver-memory 8g --executor-memory 25g --executor-cores 4 --queue spark_queue ${PROGRAM} -I ${INPUT} -O ${OUTDIR} ${PARAM}"
        )

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                           'spark_rmdup_output')

            #global param
            ParamDict = {
                "PROGRAM": self.spark_rmdup.program,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath,
                "PARAM": self.spark_rmdup.parameter
            }

            #write script
            scriptPath = \
            impl.write_shell(
                    name = 'spark_rmdup',
                    scriptsdir = scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            #result
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'Mark')
            result.script[sampleName] = scriptPath
        return result
Example #25
def parse_sample(sampleList):
    sample_lane_counter = 0
    total_number = 0

    with open(sampleList,'r') as sampleFile:
        sampleInfo = bundle()
        gender = 'male'
        thetype = ''
        pool = ''
        for line in sampleFile:
            line = line.strip()
            #sample_name, gender, family, type, pool, fq1, fq2, insert_size, fq1s
            field = line.split()
            field_num = len(field)
            sample_name = field[0]
            family = field[0]
            fq1 = field[1]
            fq2 = field[2]

            tmp = os.path.basename(fq1).split("_")
            rg_ID = tmp[0]
            rg_PU = total_number
            rg_LB = total_number
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID,rg_PU,rg_LB,sample_name)
            fq_lib_name = rg_ID
            total_number += 1

            if sample_name not in sampleInfo:
                sampleInfo[sample_name] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sample_name])

            dataTag = 'data'+str(sample_lane_counter)
            if dataTag not in sampleInfo[sample_name]:
                sampleInfo[sample_name][dataTag] = bundle()

            sampleInfo[sample_name][dataTag]['fq1'] = fq1
            sampleInfo[sample_name][dataTag]['fq2'] = fq2
            sampleInfo[sample_name][dataTag]['adp1'] = 'null'
            sampleInfo[sample_name][dataTag]['adp2'] = 'null'
            sampleInfo[sample_name][dataTag]['gender'] = gender
            sampleInfo[sample_name][dataTag]['family'] = family
            sampleInfo[sample_name][dataTag]['type'] = thetype
            sampleInfo[sample_name][dataTag]['pool'] = pool
            sampleInfo[sample_name][dataTag]['rg'] = rg
            sampleInfo[sample_name][dataTag]['libname'] = fq_lib_name

    return sampleInfo
Example #28
    def run(self, impl, dependList):
        impl.log.info("step: BwaMarkDupSpark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.BwaMarkDupSpark.program = self.expath('BwaMarkDupSpark.program')

        spark_param = self.BwaMarkDupSpark.parameter_spark
        if self.hadoop.get('queue'):
            spark_param = impl.paramCheck(True, spark_param, '--queue',
                                          self.hadoop.queue)

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTPUT}" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append(
            "${PROGRAM} BwaAndMarkDuplicatesPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} ${PARAM} -- ${PARAMSPARK}"
        )

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName,
                                           'BwaMarkDupSpark_output')

            # global param
            ParamDict = {
                "PROGRAM": self.BwaMarkDupSpark.program,
                "INPUT": inputInfo[sampleName],
                "OUTPUT": hdfs_outputPath,
                "REF": self.ref.normal.ref,
                "PARAM": self.BwaMarkDupSpark.parameter,
                "PARAMSPARK": spark_param
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='BwaMarkDupSpark',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            # result
            result.output[sampleName] = hdfs_outputPath
            result.script[sampleName] = scriptPath
        return result
Example #29
    def write_scripts(self, name, commands, JobParamList=[], paramDict={}):
        scriptDict = bundle()
        t = _generate_template(commands)
        if paramDict:
            t = Template(t.safe_substitute(paramDict))

        for d in JobParamList:
            scriptsdir = d.get('SCRDIR')
            sampleName = d.get('SAMPLE')

            if not scriptsdir or not sampleName:
                self.log.error(
                    "Error in step (%s) JobParamList (no SAMPLE or SCRDIR)." %
                    name)
                exit(1)

            scriptDict[sampleName] = os.path.join(scriptsdir, name + '.sh')
            script = open(scriptDict[sampleName], 'w')

            print >> script, '#!/bin/bash'
            print >> script, "echo ==========start %s at : `date` ==========" % name
            _script_append(script, t, paramDict=d)
            print >> script, ""
            print >> script, "echo ==========end %s at : `date` ========== " % name
            script.close()

        return scriptDict
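The private helpers _generate_template and _script_append are not included on this page. Given that commands is a list of shell lines and the returned object supports safe_substitute (and is itself re-wrapped in Template above), a plausible stand-in for _generate_template, offered as an assumption rather than the project's code:

from string import Template

def _generate_template(commands):
    # Assumed behaviour: join the command lines into one script body and wrap
    # it in string.Template, so ${KEY} placeholders can be filled in passes by
    # safe_substitute while unresolved keys survive for a later substitution.
    if isinstance(commands, basestring):   # write_file also passes a bare string
        commands = [commands]
    return Template('\n'.join(commands))

t = _generate_template(["hadoop fs -rm -r ${INPUT}/_*",
                        "${PROGRAM} -i ${INPUT} -o ${OUTDIR}"])
print(t.safe_substitute({'INPUT': '/hdfs/sample1'}))   # ${PROGRAM}, ${OUTDIR} remain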
Example #30
    def step(self, name, depends=[], commands=[], hog=False, memory=None,
             arrayParamValues=None, workflowTasksPerGridTask=1, priority=0,
             resources={}, concurrencyLimit = None):
        self._checkname(name)
        depends = wraplist(depends, 'depends')
        for dep in depends:
            if _findstep(dep, self.steps) is None:
                raise RuntimeError('step named %s does not exist' % dep)
        commands = wraplist(commands, 'commands')
        for c in commands:
            if c.strip()[-1] == '&':
                raise RuntimeError('commands must not end with "&"')

        if arrayParamValues is None:
            arrayParamValues = []
        elif 0 == len(arrayParamValues):
            raise RuntimeError('arrayParamValues has 0 elements')

        # Validate and set resource defaults.
        resources = resources.copy() # shallow copy
        if (memory is not None) and 'memorymb' in resources:
            raise RuntimeError('the memory argument and the memorymb resource cannot both be specified for a single step')
        if memory is not None:
            resources['memorymb'] = int(memory*1024)
        if 'cpu' not in resources:
            resources['cpu'] = 100
        if 'memorymb' not in resources:
            resources['memorymb'] = 1024
        memory = resources['memorymb'] / 1024.0  # true division; int / 1024 truncates under Python 2
        self.steps.append(bundle(name=name, depends=depends, commands=commands,
                                 hog=hog, memory=memory, resources=resources, priority=priority,
                                 arrayParamValues=arrayParamValues,
                                 workflowTasksPerGridTask=workflowTasksPerGridTask,
                                 concurrencyLimit=concurrencyLimit))
        return name
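A hedged usage sketch; wf stands for an instance of whatever class defines step(), and the step names, commands, and resources are illustrative only:

a = wf.step('bamSort', commands=['sh bamSort.sh'], memory=4)
# -> resources filled in as {'cpu': 100, 'memorymb': 4096}
wf.step('rmdup', depends=[a], commands=['sh rmdup.sh'],
        resources={'memorymb': 2048, 'cpu': 200})
# wf.step('bad', commands=['x'], memory=2, resources={'memorymb': 512})
# -> RuntimeError: memory and memorymb cannot both be given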
Example #31
 def write_file(self,fileName,scriptsdir,commands,JobParamList=None,paramDict={},addShellHeader=False):
     scriptDict = bundle()
     scriptDict.script = []
     
     t = _generate_template(commands)
     m = re.match('.*\$\{(\S+)\}.*',fileName)
     
     if JobParamList and m:
         for d in JobParamList:
             if not d.has_key(m.group(1)):
                 self.log.error("Bad impl.write_file parameter 'fileName': no %s in JobParamList." % m.group(1))
             if paramDict:
                 d.update(paramDict)
             file_name = _generate_template(fileName).safe_substitute(d)
             scriptFile = os.path.join(scriptsdir,file_name)
             scriptDict["script"].append(scriptFile)
             script = open(scriptFile, 'w')
             print >>script, t.safe_substitute(d)
             script.close()
     else:
         scriptFile = os.path.join(scriptsdir,fileName)
         scriptDict["script"].append(scriptFile)
         script = open(scriptFile, 'w')
         if addShellHeader:
             print >>script, '#!/bin/bash'
             print >>script, "echo ==========start %s at : `date` ==========" % os.path.splitext(fileName)[0] 
             _script_append(script, t, JobParamList, paramDict)
             print >>script, "echo ==========end %s at : `date` ========== " % os.path.splitext(fileName)[0] 
         else:   
             _script_append(script, t, JobParamList, paramDict)
         script.close()
     return scriptDict
Example #32
 def write_scripts(self, name, commands, JobParamList=[], paramDict={}):
     scriptDict = bundle()
     t = _generate_template(commands)
     
     
     for d in JobParamList:
         scriptsdir = d.get('SCRDIR')
         sampleName = d.get('SAMPLE')
         
         if not scriptsdir or not sampleName:
             self.log.error("Error in step (%s) JobParamList(no SMAPLE or SCRDIR)." % name) 
             exit(1)
         
         if paramDict:
             d.update(paramDict)
             
         scriptDict[sampleName] = os.path.join(scriptsdir,name+'.sh')   
         script = open(scriptDict[sampleName], 'w')
         
         print >>script, '#!/bin/bash'
         print >>script, "echo ==========start %s at : `date` ==========" % name
         _script_append(script, t, paramDict=d)
         print >>script, ""  
         print >>script, "echo ==========end %s at : `date` ========== " % name
         script.close()
         
     return scriptDict
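A short sketch of the contract write_scripts enforces (paths and the sample name are assumptions): every job dict must carry SAMPLE and SCRDIR, and the result maps sample name to the generated script:

# Hypothetical usage sketch for write_scripts above.
jobs = [{'SAMPLE': 'NA12878', 'SCRDIR': '/tmp/scripts/NA12878'}]
out = impl.write_scripts(name='clean',
                         commands=['echo cleaning ${SAMPLE}'],
                         JobParamList=jobs)
# out['NA12878'] == '/tmp/scripts/NA12878/clean.sh', wrapped in the
# #!/bin/bash header plus the start/end echo lines.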
Exemplo n.º 33
0
    def write_Scripts(self, name, scriptsdir, commands, JobParamList=[], paramDict={}, reducer=True):
        scriptDict = bundle()
        t = _generate_template(commands)

        scriptDict["script"] = []
        for n, d in enumerate(JobParamList):
            if paramDict:
                d.update(paramDict)
            dataTag = str(n)
            if d.get('DATATAG'):
                dataTag = d.get('DATATAG')
            scriptFile = os.path.join(scriptsdir, name + '_' + dataTag + '.sh')
            scriptDict["script"].append(scriptFile)
            script = open(scriptFile, 'w')
            if reducer:
                print >>script, t.safe_substitute(d)
            else:
                print >>script, '#!/bin/bash'
                # single %s: the original "========== %s" % name raised TypeError
                print >>script, "echo ==========start %s at : `date` ==========" % name
                print >>script, t.safe_substitute(d)
                print >>script, "echo ==========end %s at : `date` ==========" % name
            script.close()

        return scriptDict
Exemplo n.º 34
0
    def pycommand(self, func, *args, **kwargs):
        '''Creates the command to run the given python function, passing it the given arguments.

        The function (func) must be picklable by the python pickle
        package. Basically, this means it must be a global function
        within a module. The arguments must be JSON-serializable. The
        func is required to accept the arrayParamValues element for
        this task as its first argument (always a dictionary object),
        and any additional parameters must be passed to the pycommand
        function. For non-array steps, the arrayParamValues element is
        still passed, but is a dictionary with no keys.'''

        funcString = self._escapetriplequote(pickle.dumps(func))
        argBundle = bundle(args=args, kwargs=kwargs)
        argString = self._escapetriplequote(json.dumps(argBundle, indent=2))

        lines = [
            '#! /usr/bin/env python',
            'import sys',
            'sys.path = %s' % str(sys.path),
            'import os, pickle, sys, wfclib.jsonutil',
            'from cgiutils import bundle',
            'func = pickle.loads("""%s""")' % funcString,
            'argBundle = wfclib.jsonutil.loads("""%s""")' % argString,
            'apv = bundle()',
            'apvKeys = os.getenv("CGI_ARRAY_PARAM_NAMES")',
            'if apvKeys is not None:',
            '  for key in apvKeys.split(":"):',
            '    apv[key] = os.getenv(key)',
            'func(apv, *argBundle.args, **argBundle.kwargs)',
        ]
        script = '\n'.join(lines)

        return "python -u - <<'@CGI_PYCOMMAND_HERE_DOC_DELIM'\n%s\n@CGI_PYCOMMAND_HERE_DOC_DELIM" % script
Exemplo n.º 38
0
    def run(self, impl, dependList):
        impl.log.info("step: ubammerge!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())
        sampleName = self.option.multiSampleName
        merge_tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubammerge.bam')

        # extend program path
        self.ubammerge.program = self.expath('ubammerge.program')

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        # cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        # cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)

        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'ubammerge_output')
        bams = []
        for sample_name in inputInfo:
            sample_input = inputInfo[sample_name]
            for dataTag in sample_input:
                bams.append(sample_input[dataTag]['bam'])

        if len(bams) <= 1:
            merge_tmp = bams[0]
        else:
            input_bam_command = ''
            for input_bam in bams:
                input_bam_command += "--UNMAPPED_BAM {} ".format(input_bam)
            cmd.append('%s MergeBamAlignment %s -O %s -R %s' % (self.ubammerge.program, input_bam_command, merge_tmp, self.ref.normal.ref))
        cmd.append('%s fs -mkdir -p %s' % (self.hadoop.bin, hdfs_outputPath))
        cmd.append('%s fs -put %s %s' % (self.hadoop.bin, merge_tmp, hdfs_outputPath))

        # write script
        scriptPath = \
            impl.write_shell(
                name='ubammerge',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict={})  # a dict, not a list; write_shell expects a mapping

        # result
        result.output[sampleName] = hdfs_outputPath
        result.script[sampleName] = scriptPath

        return result
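For concreteness, a hedged rendering of the script this step writes when two uBAMs are found; every path below is a made-up placeholder and ${PROGRAM} stands for self.ubammerge.program:

# Hypothetical rendering of the generated ubammerge.sh with two inputs:
#   ${PROGRAM} MergeBamAlignment --UNMAPPED_BAM /data/a.bam --UNMAPPED_BAM /data/b.bam  -O /work/temp/multi/ubammerge.bam -R /refs/hg38.fa
#   hadoop fs -mkdir -p /hdfs/multi/ubammerge_output
#   hadoop fs -put /work/temp/multi/ubammerge.bam /hdfs/multi/ubammerge_output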
Exemplo n.º 39
0
    def service(self, name, serviceCommand, serviceData, concurrencyLimit=None, hog=False):
        if concurrencyLimit is None:
            concurrencyLimit = 1200
        self._checkname(name)
        self.services.append(bundle(name=name, serviceCommand=serviceCommand,
                                    serviceData=serviceData,
                                    concurrencyLimit=concurrencyLimit, hog=hog))
        return name
Exemplo n.º 40
0
    def run(self, impl, dependList):
        impl.log.info("step: rmdup!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.rmdup.program = self.expath('rmdup.program')

        if self.init.get('isSE'):
            self.rmdup.parameter = self.rmdup.parameter_SE
        hadoop_parameter = ''
        if self.hadoop.get('queue'):
            hadoop_parameter += '-D mapreduce.job.queuename={}'.format(self.hadoop.queue)

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append("${PROGRAM} %s -i ${INPUT} -o ${OUTDIR} -R ${REDUCERNUM} ${PARAM}" % hadoop_parameter)

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'rmdup_output')

            # global param
            ParamDict = {
                "PROGRAM": "%s jar %s MarkDuplicate" % (self.hadoop.bin, self.rmdup.program),
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath,
                "REDUCERNUM": self.hadoop.reducer_num,
                "PARAM": self.rmdup.parameter
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='rmdup',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            # result
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'Mark')
            result.script[sampleName] = scriptPath
        return result
Exemplo n.º 41
0
    def run(self, impl,dependList):
        impl.log.info("step: clean!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(),script=bundle())
        
        #extend program path
        #self.clean.program = self.expath('clean.program')

        #script template    
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("source %s/bin/activate" % self.GAEA_HOME)
        cmd.append("check.py -s %s/state.json -n ${SAMPLE} -t %s %s" % (self.stateDir, ','.join(dependList), self.init.check_state_param))
        cmd.append("if [ $? = 0 ];then")
        cmd.append("%s %s/${SAMPLE}" % (fs_cmd.delete, self.option.dirHDFS))
        if self.init.check_state_param:
            cmd.append("${CPVCF}")
        cmd.append("fi")
            
        JobParamList = []
        for sampleName in self.sample:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir,sampleName)
            vcf = ''
            for step in dependList:
                vcf_tmp = self.results[step].output[sampleName]
                if os.path.basename(vcf_tmp).find('vcf') != -1:
                    vcf = vcf_tmp
                    break
            
            #global param
            JobParamList.append({
                    "SAMPLE" : sampleName,
                    "SCRDIR" : scriptsdir,
                    "CPVCF" : "cp %s /ldfssz1/ST_HEALTH/WGS/project/3000members_hg38/vcf/" % vcf if vcf else ''
                })
            
        #write script
        scriptPath = \
        impl.write_scripts(
                name = 'clean',
                commands=cmd,
                JobParamList=JobParamList)
    
        #result
        result.script.update(scriptPath)           
        return result
Exemplo n.º 42
0
    def run(self, impl, dependList):
        impl.log.info("step: BwaMarkDupSpark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.BwaMarkDupSpark.program = self.expath('BwaMarkDupSpark.program')

        spark_param = self.BwaMarkDupSpark.parameter_spark
        if self.hadoop.get('queue'):
            spark_param = impl.paramCheck(True, spark_param, '--queue', self.hadoop.queue)

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTPUT}" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append("${PROGRAM} BwaAndMarkDuplicatesPipelineSpark -I ${INPUT} -O ${OUTPUT} -R ${REF} ${PARAM} -- ${PARAMSPARK}")

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'BwaMarkDupSpark_output')

            # global param
            ParamDict = {
                "PROGRAM": self.BwaMarkDupSpark.program,
                "INPUT": inputInfo[sampleName],
                "OUTPUT": hdfs_outputPath,
                "REF": self.ref.normal.ref,
                "PARAM": self.BwaMarkDupSpark.parameter,
                "PARAMSPARK": spark_param
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='BwaMarkDupSpark',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            # result
            result.output[sampleName] = hdfs_outputPath
            result.script[sampleName] = scriptPath
        return result
Exemplo n.º 43
0
    def parse_userjson(self, jsonfile):
        userConf = bundle()
        with open(jsonfile, 'r') as uf:
            data = uf.read()
            try:
                userConf = clean(json.loads(data))
                self.extendcfg(userConf)
            except Exception, e:
                print "parse %s failed: %s" % (self.config, repr(e))
        return userConf  # parse() below relies on getting the parsed config back
Exemplo n.º 44
0
def run(args, state):
    analysisDict = state.analysisDict
    sampleName = args.sampleName
    logger = Logger(os.path.join(state.scriptsDir, 'log'), '1',
                    'gaeaJobMonitor', False).getlog()
    isComplete = bundle()

    all_done = True

    jobList = args.jobs.split(',')

    if jobList[0] == 'init':
        if not state.results['init'].get('script'):
            jobList = jobList[1:]

    for num, step in enumerate(jobList):
        if analysisDict[step].platform == 'S':
            continue

        n = state.analysisList.index(step)
        if state.analysisList[0] != 'init':
            n += 1

        script = state.results[step]['script'][sampleName]
        if num > 0:
            for depStep in analysisDict[step].depend:
                if not isComplete[depStep]:
                    isComplete[step] = False
                    break
        if isComplete.has_key(step) and not isComplete[step]:
            logger.warning('%s - step %d: %s failed' % (sampleName, n, step))
            continue

        printtime('step: %s start...' % step)
        p = subprocess.Popen('sh %s' % script,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        isComplete[step] = check_log(p, script, sampleName, n, step)
        if isComplete[step] or step == 'alignment':
            if step == 'alignment':
                isComplete[step] = True
            printtime("step: %s complete" % step)
            logger.info('%s - step %d: %s complete' % (sampleName, n, step))
            out_fh = open(script + '.o', 'w')
            for line in p.stdout.readlines():
                print >> out_fh, line[:-1]
            p.wait()
        else:
            all_done = False
            printtime("%s failed" % step)
            logger.warning('%s - step %d: %s failed' % (sampleName, n, step))
            if p.returncode is None:
                p.kill()

    return all_done
Exemplo n.º 45
0
    def parse(self, user_config=''):
        configInfo = bundle()
        if user_config:
            if user_config.endswith('.json'):  # also covers 'config.json'; the extra check was redundant
                configInfo = self.parse_userjson(user_config)
            else:
                try:
                    configInfo = self.parse_usercfg(user_config)
                except Exception, e:
                    print "parse %s failed: %s" % (user_config, repr(e))
        return configInfo  # callers such as run() below expect the parsed config
Exemplo n.º 46
0
def run(args):
    binPath = os.path.split(os.path.realpath(__file__))[0]
    os.environ['GAEA_HOME'] = os.path.split(binPath)[0]
    createVar = locals()
    defaultConfig = os.path.join(os.environ['GAEA_HOME'],'config','default.json')
    usercfg = bundle()
    try: 
        usercfg = ParseConfig(defaultConfig).parse(args.config)
    except Exception,e:  
        raise RuntimeError("Parse UserConfig failed," + repr(e) + "\n")
Exemplo n.º 47
0
def getAnalysisDict(analysis_flow):
    graph = bundle(init=bundle())
    graph['init']['depend'] = []
    graph['init']['platform'] = 'H'
    for stepList in analysis_flow:
        if not graph.has_key(stepList[1]):
            graph[stepList[1]] = bundle()
        
        graph[stepList[1]]['depS'] = False
        if len(stepList) == 2:
            graph[stepList[1]]['depend'] = ['init']
            graph[stepList[1]]['platform'] = stepList[0].upper()
        else:
            graph[stepList[1]]['depend'] = stepList[2].split(',')
            graph[stepList[1]]['platform'] = stepList[0].upper()
            for dep in graph[stepList[1]]['depend']:
                if graph[dep]['platform'].upper() == 'S':
                    graph[stepList[1]]['depS'] = True
                    
    return graph
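A hedged input/output sketch for getAnalysisDict; the step names are invented. Each analysis_flow entry is [platform, step] or [platform, step, 'dep1,dep2']:

# Hypothetical flow: alignment runs on Hadoop ('H'); report runs on a
# standalone platform ('S') and depends on alignment.
flow = [['H', 'alignment'],
        ['S', 'report', 'alignment']]
graph = getAnalysisDict(flow)
# graph.alignment.depend == ['init'];   graph.alignment.platform == 'H'
# graph.report.depend == ['alignment']; graph.report.platform == 'S'
# graph.report.depS stays False: depS flags steps whose *dependency*
# runs on platform 'S', not the step itself.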
Exemplo n.º 49
0
def check_ref_type(ref):
    if not ref.has_key('normal'):
        ref.normal = bundle()
    if not ref.has_key('male'):
        ref.male = bundle()
    if not ref.has_key('female'):
        ref.female = bundle()
    
    if ref.male.get('ref') and ref.female.get('ref'):
        logger.info('male.ref: %s, female: %s. use gender mode!' % (ref.male.ref, ref.female.ref))
        ref['gender_mode'] = 'both'
        ref.normal.rupdate(ref.male)
            
    if ref.normal.get('ref') and ref.female.get('ref') and not ref.male.get('ref'):
#         logger.warning("male ref don't exists! use normal as male.")
        ref['gender_mode'] = 'both'
        ref.male.rupdate(ref.normal)
            
    if ref.normal.get('ref') and not ref.female.get('ref') and not ref.male.get('ref'):
#         logger.warning("male and female ref don't exists! use normal mode!")
        ref['gender_mode'] = 'normal'
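A small hedged sketch of the gender-mode resolution above (the reference paths are invented):

# Hypothetical input: only male and female references configured.
ref = bundle(male=bundle(ref='/refs/male.fa'),
             female=bundle(ref='/refs/female.fa'))
check_ref_type(ref)
# ref.gender_mode == 'both' and ref.normal has inherited the male
# reference via rupdate; with only ref.normal.ref set, the function
# would instead settle on ref.gender_mode == 'normal'.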
Exemplo n.º 51
0
    def write_shell(self, name, scriptsdir, commands, JobParamList=[], paramDict={}):
        t = _generate_template(commands)

        scriptFile = os.path.join(scriptsdir, name + '.sh')
        script = open(scriptFile, 'w')
        print >>script, '#!/bin/bash'
        print >>script, "echo ==========start %s at : `date` ==========" % name
        _script_append(script, t, JobParamList, paramDict)
        print >>script, ""
        print >>script, "echo ==========end %s at : `date` ==========" % name
        script.close()
        return scriptFile
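For illustration, a hedged rendering of what write_shell emits; the step name, paths, and parameters below are invented:

# write_shell(name='rmdup', scriptsdir='/tmp',
#             commands=['${PROGRAM} -i ${INPUT}'],
#             paramDict={'PROGRAM': 'hadoop jar gaea.jar MarkDuplicate',
#                        'INPUT': '/hdfs/in'})
# would produce /tmp/rmdup.sh containing:
#   #!/bin/bash
#   echo ==========start rmdup at : `date` ==========
#   hadoop jar gaea.jar MarkDuplicate -i /hdfs/in
#
#   echo ==========end rmdup at : `date` ==========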
Exemplo n.º 52
0
    def run(self, impl, dependList):
        impl.log.info("step: rmdup_spark!")
        inputInfo = self.results[dependList[0]].output
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.rmdup_spark.program = self.expath('rmdup_spark.program')

        # script template
        fs_cmd = self.fs_cmd
        cmd = []
        cmd.append("%s ${OUTDIR}/" % fs_cmd.delete)
        cmd.append("%s ${INPUT}/*/_SUCCESS ${INPUT}/*/_logs" % fs_cmd.delete)
        cmd.append("spark-submit --class org.bgi.flexlab.gaea.spark.example.DuplicationMark --master yarn --num-executors 48 --driver-memory 8g --executor-memory 25g --executor-cores 4 --queue spark_queue ${PROGRAM} -I ${INPUT} -O ${OUTDIR} ${PARAM}")

        for sampleName in inputInfo:
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
            hdfs_outputPath = os.path.join(self.option.dirHDFS, sampleName, 'rmdup_spark_output')

            # global param
            ParamDict = {
                "PROGRAM": self.rmdup_spark.program,
                "INPUT": inputInfo[sampleName],
                "OUTDIR": hdfs_outputPath,
                "PARAM": self.rmdup_spark.parameter
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='rmdup_spark',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)

            # result
            result.output[sampleName] = os.path.join(hdfs_outputPath, 'Mark')
            result.script[sampleName] = scriptPath
        return result
Exemplo n.º 53
0
def bundle_rcopy(cfg):
    newdict = bundle()
    for entry in cfg:
        this_entry = cfg[entry]
        if isinstance(this_entry, dict):
            this_entry = bundle_rcopy(this_entry)
        elif isinstance(this_entry, list): # create a copy rather than a reference
            this_entry = list(this_entry)
        elif isinstance(this_entry, tuple): # create a copy rather than a reference
            this_entry = tuple(this_entry)
        newdict[entry] = this_entry
    
    return newdict
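A hedged usage sketch of bundle_rcopy (keys and values invented): nested dicts are copied recursively and lists/tuples are copied at the top level, so mutating the copy cannot leak back into the source config:

cfg = bundle(ref=bundle(path='/refs/hg38.fa'), steps=['align', 'rmdup'])
dup = bundle_rcopy(cfg)
dup.ref.path = '/refs/hg19.fa'   # does not touch cfg.ref.path
dup.steps.append('clean')        # cfg.steps is still ['align', 'rmdup']
assert cfg.ref.path == '/refs/hg38.fa'
assert cfg.steps == ['align', 'rmdup']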