예제 #1
0
    def new_tasks(self, extra):
        """
        For each line of the input .csv file generate an execution Task.

        :param extra: dict of keyword arguments shared by all tasks;
            copied per-task so each task gets its own jobname/output_dir.
        :return: a one-element list holding a `ParallelTaskCollection`
            with one `MatlabApp` per CSV row.
        :raise gc3libs.exceptions.InvalidUsage: if a CSV row does not
            have exactly 11 fields.
        """
        tasks = []
        # `run` is the 1-based index of the CSV row (was an ambiguous
        # hand-rolled counter named `l` before).
        for run, parameter in enumerate(
                self._enumerate_csv(self.params.csv_input_file), start=1):
            if len(parameter) != 11:
                raise gc3libs.exceptions.InvalidUsage(
                    "Parameter length not correct")
            jobname = "run%s" % str(run)
            extra_args = extra.copy()
            extra_args['jobname'] = jobname

            # Everything in results folder on remote computer
            extra_args['output_dir'] = CLOUDNAME  # Not working
            extra_args['output_dir'] = "%s%s" % (extra_args['output_dir'],
                                                 jobname)

            tasks.append(
                MatlabApp(self.params.matlab_function, parameter,
                          self.params.matlab_source_folder, run, **extra_args))
        return [ParallelTaskCollection(tasks, **extra)]
예제 #2
0
    def new_tasks(self, extra):
        """
        Build the task(s) to run: a parallel or sequential collection of
        numbered `GRunApplication`s when a count was given on the command
        line, otherwise a single application.
        """
        # Per-application extras must not carry 'output_dir': every
        # application below sets its own.
        app_extra = extra.copy()
        del app_extra['output_dir']

        def make_app(index):
            # One numbered application with its own job name / output dir.
            return GRunApplication(self.params.args,
                                   jobname='GRunApplication.%d' % index,
                                   output_dir='GRunApplication.%d.d' % index,
                                   **app_extra)

        if self.params.parallel:
            apps = [make_app(i) for i in range(self.params.parallel)]
            task = ParallelTaskCollection(apps, **extra)
        elif self.params.sequential:
            apps = [make_app(i) for i in range(self.params.sequential)]
            task = SequentialTaskCollection(apps, **extra)
        else:
            task = GRunApplication(self.params.args, **extra)

        return [task]
예제 #3
0
    def stage0(self):
        """
        Chunk input table and run chunks in parallel
        """
        chunk_tasks = []
        for chunk_file, chunk_index in generate_chunked_files_and_list(
                self.input_table_file, self.chunk_size):
            job_name = "gbugs-%s" % (str(chunk_index))
            task_args = self.extra.copy()
            task_args['index_chunk'] = str(chunk_index)
            task_args['jobname'] = job_name

            # Expand every placeholder in the session's output-dir
            # template with the job name.
            for placeholder in ('NAME', 'SESSION', 'DATE', 'TIME'):
                task_args['output_dir'] = task_args['output_dir'].replace(
                    placeholder, job_name)

            if self.driver_script:
                task_args['driver_script'] = self.driver_script

            gc3libs.log.debug("Creating Task for index : %d - %d" %
                              (chunk_index, (chunk_index + self.chunk_size)))

            chunk_tasks.append(GBugsApplication(chunk_file, **task_args))
        return ParallelTaskCollection(chunk_tasks)
 def new_tasks(self, extra):
     """
     Create one `GRunApplication` per input directory and run them all
     in parallel.

     :param extra: keyword arguments forwarded by the session (unused here).
     :return: a one-element list holding a `ParallelTaskCollection`.
     """
     fold_names = [os.path.basename(path) for path in self.params.input_dirs]
     apps = []
     for image in fold_names:
         # `image` is already a basename; the original called the bare
         # name `basename()` here, which raises `NameError` unless
         # `from os.path import basename` was done elsewhere in the file.
         output_dir = ("colorized-{name}.d".format(name=image))
         apps.append(GRunApplication(image, output_dir))
     task = ParallelTaskCollection(apps)
     return [task]
예제 #5
0
    def stage1(self):
        """
        Run a RICC2 job for each valid CBAS/CABS basis combination,
        re-using the results from RIDFT in `stage0`.

        If RIDFT failed, exit immediately.
        """
        # terminate if first stage was unsuccessful
        rc = self.tasks[0].execution.returncode
        if rc is not None and rc != 0:
            return rc
        # else, proceeed with 2nd pass
        pass2 = [ ]
        # geometry file produced by the RIDFT run; copied into every RICC2 dir
        ridft_coord = os.path.join(self.tasks[0].turbomole_output_dir, 'coord')
        for ricc2_in in self.ricc2_ins:
            # basis-set names for this CBAS/CABS combination
            cbas = ricc2_in._keywords['CBAS_BASIS']
            cabs = ricc2_in._keywords['CABS_BASIS']
            ricc2_dir = os.path.join(self.work_dir,
                                     'cbas-%s/cabs-%s/ricc2' % (cbas, cabs))
            # stage a fresh working directory seeded with the RIDFT geometry
            gc3libs.utils.mkdir(ricc2_dir)
            gc3libs.utils.copyfile(ridft_coord, ricc2_dir)
            ricc2_define_in = _make_define_in(ricc2_dir, ricc2_in)
            ricc2_output_dir = os.path.join(ricc2_dir, 'output')
            # guess duration of the RICC2 job
            extra = self.extra.copy()
            # NOTE(review): aug-cc-pV5Z anywhere in the basis mix is assumed
            # to need ~4x the walltime -- confirm these estimates
            if ('aug-cc-pV5Z' == self.orb_basis
                or 'aug-cc-pV5Z' == self.rijk_basis
                or 'aug-cc-pV5Z' == cbas
                or 'aug-cc-pV5Z' == cabs):
                extra.setdefault('requested_walltime', 4*hours)
            else:
                extra.setdefault('requested_walltime', 1*hours)
            pass2.append(
                TurbomoleAndXmlProcessingPass(
                    # job name
                    ('ricc2-%s-%s-%s' % (self.name, cbas, cabs)),
                    # TURBOMOLE application to run
                    NonLocalTurbomoleDefineApplication(
                        'ricc2', ricc2_define_in,
                        # the second pass builds on files defined in the first one
                        os.path.join(ricc2_dir, 'coord'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'control'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'energy'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'mos'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'basis'),
                        os.path.join(self.tasks[0].turbomole_output_dir, 'auxbasis'),
                        output_dir = ricc2_output_dir,
                        stdout = 'ricc2.out',
                        **extra),
                    os.path.join(ricc2_output_dir, 'xml-processing'),
                    # DB parameters
                    # FIXME: make these settable on the command-line
                    db_dir='/db/home/fox/gricomp', db_user='******', db_pass='******',
                    # TaskCollection required params
                    **self.extra))
            gc3libs.log.debug("Created RICC2 task in directory '%s'", ricc2_dir)
        return (ParallelTaskCollection(self.name + '.pass2', pass2))
예제 #6
0
    def stage0(self):
        """
        Stage0: for each sample run GATK pipeline steps 1,2,3
        * 1 sample takes 24-72 hours on single core
        * GATK can be scripted to run individual steps
        * Output: 2 files per sample (g.vcf and g.vcf.idx size 1GB total)
        # 300 samples - see if we can allocate 150 cores for 2 days
        # 1 day each
        Example script:
java -jar -d64 ~/programs/GenomeAnalysisTK.jar\
     -T HaplotypeCaller\
     --emitRefConfidence GVCF\
     -minPruning 3 -stand_call_conf 30 \
     -stand_emit_conf 10 \
     -R ~/goat.genome/goat_scaffoldFG_V1.1.normalised.22.07.fa -I \
        $file -o ${samplename}.g.vcf
        """

        tasks = []

        for (bam_file,bai_file) in get_bams(self.input_bam_folder):
            extra_args = self.extra.copy()
            extra_args['sample_name'] = os.path.basename(bam_file).split('.')[0]
            extra_args['bam_filename'] = os.path.basename(bam_file)
            extra_args['bai_filename'] = os.path.basename(bai_file)
            extra_args['jobname'] = "gatk-s0-%s" % extra_args['bam_filename']

            extra_args['output_dir'] = extra_args['output_dir'].replace('NAME',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('SESSION',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('DATE',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('TIME',
                                                                        extra_args['jobname'])

            gc3libs.log.debug("Creating Stage0 task for : %s" %
                              (extra_args['bam_filename']))

            tasks.append(GATKS0Application(
                bam_file,
                bai_file,
                **extra_args))

        return ParallelTaskCollection(tasks)
예제 #7
0
 def new_tasks(self, extra):
     """
     Create a `WarholizeWorkflow` for every image file given on the
     command line and run them all in parallel.

     :param extra: keyword arguments shared by all tasks; ``size`` is
         injected when given on the command line.
     :return: a one-element list holding a `ParallelTaskCollection`.
     :raise gc3libs.exceptions.InvalidUsage: if no valid image file
         was supplied.
     """
     # (removed a stray no-op statement `extra` that was here)
     if self.params.size:
         extra['size'] = self.params.size
     gc3libs.log.info("Creating main sequential task")
     tasks = []
     for input_file in self.params.args:
         if not os.path.isfile(input_file):
             # skip non-file arguments instead of aborting the session;
             # lazy %-args instead of eager string formatting
             gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                               input_file)
             continue
         extra_args = extra.copy()
         extra_args['output_dir'] = 'Warholized.%s' % os.path.basename(
             input_file)
         tasks.append(
             WarholizeWorkflow(input_file, self.params.copies,
                               self.params.num_colors, **extra_args))
     if not tasks:
         raise gc3libs.exceptions.InvalidUsage(
             "Missing or invalid image file.")
     return [ParallelTaskCollection(tasks, **extra)]
예제 #8
0
    def stage1(self):
        """
        Step 1: For each available statistical method, run independent application
        """
        method_tasks = []

        for method in STATS:
            run_args = self.extra.copy()
            run_args['jobname'] = method
            run_args['results'] = self.s1_outputfolder
            # Replace every output-dir template placeholder with the job name.
            for token in ('NAME', 'SESSION', 'DATE', 'TIME'):
                run_args['output_dir'] = run_args['output_dir'].replace(
                    token, run_args['jobname'])

            method_tasks.append(
                GenREMDatasetApplication(method, [self.s0_outputfolder],
                                         self.source_folder, **run_args))
        return ParallelTaskCollection(method_tasks)
예제 #9
0
파일: warholize.py 프로젝트: imcf/gc3pie
 def new_tasks(self, extra):
     """
     Build one `WarholizeWorkflow` per image file given on the command
     line and run them all in parallel.
     """
     if self.params.size:
         extra['size'] = self.params.size
     tasks = []
     for input_file in self.params.args:
         if not os.path.isfile(input_file):
             gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                               input_file)
             continue
         gc3libs.log.info(
             "Creating sequential task for processing file `%s`",
             input_file)
         extra_args = extra.copy()
         base_dir = extra_args.get('output_dir', os.getcwd())
         wrapped_dir = os.path.join(
             base_dir, 'Warholized.' + os.path.basename(input_file))
         # Strip the literal '/NAME/' template component left in the
         # session-supplied output-dir template.
         extra_args['output_dir'] = wrapped_dir.replace(
             '/NAME/', '/')  ## yes, it's a bug
         tasks.append(
             WarholizeWorkflow(input_file, self.params.copies,
                               self.params.num_colors, **extra_args))
     if not tasks:
         raise gc3libs.exceptions.InvalidUsage(
             "Missing or invalid image file.")
     return [ParallelTaskCollection(tasks, **extra)]
예제 #10
0
    def stage1(self):
        """
        Start this stage IIF stage0 all completed (i.e. no failures)
        combine all .g.vcf files alltogether
        group in blocks (e.g. 30 out of the total 300)
        * make grouping an option for stage1
        * Use same GATK and goat.genome vesion as in stage0
        Run "combine_gvcf" script
        script can take an arbitrary number of gvc files and prodices
        1 single gvcf file
        end of stage1: 10 .g.vcf files
        if fails - because of heap size - then re-run with more memory
        Walltime: 2days each
        Cores requires: 10 cores
        Memory 500GB memory top - need to check
        memory: 128GB
        Example script:
java -jar  /home/dleigh/GenomeAnalysisTK-3.1-1/GenomeAnalysisTK-3.4-46/GenomeAnalysisTK.jar \
    -T CombineGVCFs \
    -R /home/dleigh/goatgenome/01.GENOME/scaffold/goat_scaffoldFG_V1.1.normalised.22.07.fa \
--variant /home/dleigh/demultiplexed.reads/GATK/GR0766.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1380.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1387.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1390.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1422.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1424.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1440.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1441.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1709.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1728.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1938.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1939.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR1997.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2001.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2053.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2055.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/GR2056.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0038.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0047.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0101.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0242.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0258.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0261.g.vcf \
--variant /home/dleigh/demultiplexed.reads/GATK/SG0306.g.vcf \
-o /home/dleigh/demultiplexed.reads/GATK/combined3.g.vcf

        get list of all outputs in 'outputs0' folder
        group them in 's1_chunk'
        for each group run GATKS1Application
        """
        # XXX: add check if stage0 completed properly
        # Stop otherwise

        tasks = []

        for (vcf_group,index) in get_vcf_group(self.extra['S0_output'],
                                               int(self.extra['S1_group'])):
            extra_args = self.extra.copy()
            extra_args['jobname'] = "gatk-s1-%d" % index

            extra_args['output_dir'] = extra_args['output_dir'].replace('NAME',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('SESSION',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('DATE',
                                                                        extra_args['jobname'])
            extra_args['output_dir'] = extra_args['output_dir'].replace('TIME',
                                                                        extra_args['jobname'])

            gc3libs.log.debug("Creating Stage1 task for : %d" %
                              index)

            tasks.append(GATKS1Application(
                vcf_group,
                index,
                **extra_args))

        return ParallelTaskCollection(tasks)