def new_tasks(self, extra): """ For each line of the input .csv file generate an execution Task """ tasks = [] l = 0 for parameter in self._enumerate_csv(self.params.csv_input_file): parameter_str = '.'.join(str(x) for x in parameter) parlength = len(parameter) if not parlength == 11: raise gc3libs.exceptions.InvalidUsage( "Parameter length not correct") l = l + 1 run = l jobname = "run%s" % str(l) extra_args = extra.copy() extra_args['jobname'] = jobname #Everything in results folder on remote computer extra_args['output_dir'] = CLOUDNAME #Not working #extra_args['output_dir'] = extra_args['output_dir'].replace('NAME', DEFAULT_REMOTE_OUTPUT_FOLDER) #save on local machine# extra_args['output_dir'] = "%s%s" % (extra_args['output_dir'], jobname) tasks.append( MatlabApp(self.params.matlab_function, parameter, self.params.matlab_source_folder, run, **extra_args)) return [ParallelTaskCollection(tasks, **extra)]
def new_tasks(self, extra):
    appextra = extra.copy()
    del appextra['output_dir']
    if self.params.parallel:
        task = ParallelTaskCollection([
            GRunApplication(self.params.args,
                            jobname='GRunApplication.%d' % i,
                            output_dir='GRunApplication.%d.d' % i,
                            **appextra)
            for i in range(self.params.parallel)
        ], **extra)
    elif self.params.sequential:
        task = SequentialTaskCollection([
            GRunApplication(self.params.args,
                            jobname='GRunApplication.%d' % i,
                            output_dir='GRunApplication.%d.d' % i,
                            **appextra)
            for i in range(self.params.sequential)
        ], **extra)
    else:
        task = GRunApplication(self.params.args, **extra)
    return [task]
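# A possible declaration (an assumption, not taken from the actual script) of
# the `parallel` / `sequential` parameters consumed by new_tasks() above:
def setup_options(self):
    self.add_param('--parallel', type=int, default=0, metavar='N',
                   help="Run N independent copies of the command in parallel.")
    self.add_param('--sequential', type=int, default=0, metavar='N',
                   help="Run N copies of the command, one after the other.")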
def stage0(self): """ Chunk input table and run chunks in parallel """ tasks = [] for (input_file, index_chunk) in generate_chunked_files_and_list( self.input_table_file, self.chunk_size): jobname = "gbugs-%s" % (str(index_chunk)) extra_args = self.extra.copy() extra_args['index_chunk'] = str(index_chunk) extra_args['jobname'] = jobname # extra_args['output_dir'] = self.params.output extra_args['output_dir'] = extra_args['output_dir'].replace( 'NAME', jobname) extra_args['output_dir'] = extra_args['output_dir'].replace( 'SESSION', jobname) extra_args['output_dir'] = extra_args['output_dir'].replace( 'DATE', jobname) extra_args['output_dir'] = extra_args['output_dir'].replace( 'TIME', jobname) if self.driver_script: extra_args['driver_script'] = self.driver_script gc3libs.log.debug("Creating Task for index : %d - %d" % (index_chunk, (index_chunk + self.chunk_size))) tasks.append(GBugsApplication(input_file, **extra_args)) return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    folder_names = [os.path.basename(path) for path in self.params.input_dirs]
    apps = []
    for image in folder_names:
        output_dir = "colorized-{name}.d".format(name=image)
        apps.append(GRunApplication(image, output_dir=output_dir))
    task = ParallelTaskCollection(apps)
    return [task]
def stage1(self): """ Run a RICC2 job for each valid CBAS/CABS basis combination, re-using the results from RIDFT in `stage0`. If RIDFT failed, exit immediately. """ # terminate if first stage was unsuccessful rc = self.tasks[0].execution.returncode if rc is not None and rc != 0: return rc # else, proceeed with 2nd pass pass2 = [ ] ridft_coord = os.path.join(self.tasks[0].turbomole_output_dir, 'coord') for ricc2_in in self.ricc2_ins: cbas = ricc2_in._keywords['CBAS_BASIS'] cabs = ricc2_in._keywords['CABS_BASIS'] ricc2_dir = os.path.join(self.work_dir, 'cbas-%s/cabs-%s/ricc2' % (cbas, cabs)) gc3libs.utils.mkdir(ricc2_dir) gc3libs.utils.copyfile(ridft_coord, ricc2_dir) ricc2_define_in = _make_define_in(ricc2_dir, ricc2_in) ricc2_output_dir = os.path.join(ricc2_dir, 'output') # guess duration of the RICC2 job extra = self.extra.copy() if ('aug-cc-pV5Z' == self.orb_basis or 'aug-cc-pV5Z' == self.rijk_basis or 'aug-cc-pV5Z' == cbas or 'aug-cc-pV5Z' == cabs): extra.setdefault('requested_walltime', 4*hours) else: extra.setdefault('requested_walltime', 1*hours) pass2.append( TurbomoleAndXmlProcessingPass( # job name ('ricc2-%s-%s-%s' % (self.name, cbas, cabs)), # TURBOMOLE application to run NonLocalTurbomoleDefineApplication( 'ricc2', ricc2_define_in, # the second pass builds on files defined in the first one os.path.join(ricc2_dir, 'coord'), os.path.join(self.tasks[0].turbomole_output_dir, 'control'), os.path.join(self.tasks[0].turbomole_output_dir, 'energy'), os.path.join(self.tasks[0].turbomole_output_dir, 'mos'), os.path.join(self.tasks[0].turbomole_output_dir, 'basis'), os.path.join(self.tasks[0].turbomole_output_dir, 'auxbasis'), output_dir = ricc2_output_dir, stdout = 'ricc2.out', **extra), os.path.join(ricc2_output_dir, 'xml-processing'), # DB parameters # FIXME: make these settable on the command-line db_dir='/db/home/fox/gricomp', db_user='******', db_pass='******', # TaskCollection required params **self.extra)) gc3libs.log.debug("Created RICC2 task in directory '%s'", ricc2_dir) return (ParallelTaskCollection(self.name + '.pass2', pass2))
def stage0(self): """ Stage0: for each sample run GATK pipeline steps 1,2,3 * 1 sample takes 24-72 hours on single core * GATK can be scripted to run individual steps * Output: 2 files per sample (g.vcf and g.vcf.idx size 1GB total) # 300 samples - see if we can allocate 150 cores for 2 days # 1 day each Example script: java -jar -d64 ~/programs/GenomeAnalysisTK.jar\ -T HaplotypeCaller\ --emitRefConfidence GVCF\ -minPruning 3 -stand_call_conf 30 \ -stand_emit_conf 10 \ -R ~/goat.genome/goat_scaffoldFG_V1.1.normalised.22.07.fa -I \ $file -o ${samplename}.g.vcf """ tasks = [] for (bam_file,bai_file) in get_bams(self.input_bam_folder): extra_args = self.extra.copy() extra_args['sample_name'] = os.path.basename(bam_file).split('.')[0] extra_args['bam_filename'] = os.path.basename(bam_file) extra_args['bai_filename'] = os.path.basename(bai_file) extra_args['jobname'] = "gatk-s0-%s" % extra_args['bam_filename'] extra_args['output_dir'] = extra_args['output_dir'].replace('NAME', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('SESSION', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('DATE', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('TIME', extra_args['jobname']) gc3libs.log.debug("Creating Stage0 task for : %s" % (extra_args['bam_filename'])) tasks.append(GATKS0Application( bam_file, bai_file, **extra_args)) return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    if self.params.size:
        extra['size'] = self.params.size
    gc3libs.log.info("Creating main sequential task")
    tasks = []
    for (i, input_file) in enumerate(self.params.args):
        if not os.path.isfile(input_file):
            gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                              input_file)
            continue
        extra_args = extra.copy()
        extra_args['output_dir'] = 'Warholized.%s' % os.path.basename(
            input_file)
        tasks.append(
            WarholizeWorkflow(input_file,
                              self.params.copies,
                              self.params.num_colors,
                              **extra_args))
    if not tasks:
        raise gc3libs.exceptions.InvalidUsage(
            "Missing or invalid image file.")
    return [ParallelTaskCollection(tasks, **extra)]
def stage1(self): """ Step 1: For each available statistical method, run independent application """ tasks = [] for method in STATS: extra_args = self.extra.copy() extra_args['jobname'] = method extra_args['results'] = self.s1_outputfolder extra_args['output_dir'] = extra_args['output_dir'].replace( 'NAME', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace( 'SESSION', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace( 'DATE', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace( 'TIME', extra_args['jobname']) tasks.append( GenREMDatasetApplication(method, [self.s0_outputfolder], self.source_folder, **extra_args)) return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    if self.params.size:
        extra['size'] = self.params.size
    tasks = []
    for (i, input_file) in enumerate(self.params.args):
        if not os.path.isfile(input_file):
            gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                              input_file)
            continue
        gc3libs.log.info(
            "Creating sequential task for processing file `%s`",
            input_file)
        extra_args = extra.copy()
        extra_args['output_dir'] = os.path.join(
            extra_args.get('output_dir', os.getcwd()),
            'Warholized.' + os.path.basename(input_file)).replace(
                '/NAME/', '/')  ## yes, it's a bug
        tasks.append(
            WarholizeWorkflow(input_file,
                              self.params.copies,
                              self.params.num_colors,
                              **extra_args))
    if not tasks:
        raise gc3libs.exceptions.InvalidUsage(
            "Missing or invalid image file.")
    return [ParallelTaskCollection(tasks, **extra)]
def stage1(self): """ Start this stage IIF stage0 all completed (i.e. no failures) combine all .g.vcf files alltogether group in blocks (e.g. 30 out of the total 300) * make grouping an option for stage1 * Use same GATK and goat.genome vesion as in stage0 Run "combine_gvcf" script script can take an arbitrary number of gvc files and prodices 1 single gvcf file end of stage1: 10 .g.vcf files if fails - because of heap size - then re-run with more memory Walltime: 2days each Cores requires: 10 cores Memory 500GB memory top - need to check memory: 128GB Example script: java -jar /home/dleigh/GenomeAnalysisTK-3.1-1/GenomeAnalysisTK-3.4-46/GenomeAnalysisTK.jar \ -T CombineGVCFs \ -R /home/dleigh/goatgenome/01.GENOME/scaffold/goat_scaffoldFG_V1.1.normalised.22.07.fa \ --variant /home/dleigh/demultiplexed.reads/GATK/GR0766.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1380.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1387.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1390.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1422.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1424.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1440.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1441.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1709.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1728.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1938.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1939.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR1997.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR2001.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR2053.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR2055.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/GR2056.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0038.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0047.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0101.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0242.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0258.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0261.g.vcf \ --variant /home/dleigh/demultiplexed.reads/GATK/SG0306.g.vcf \ -o /home/dleigh/demultiplexed.reads/GATK/combined3.g.vcf get list of all outputs in 'outputs0' folder group them in 's1_chunk' for each group run GATKS1Application """ # XXX: add check if stage0 completed properly # Stop otherwise tasks = [] for (vcf_group,index) in get_vcf_group(self.extra['S0_output'], int(self.extra['S1_group'])): extra_args = self.extra.copy() extra_args['jobname'] = "gatk-s1-%d" % index extra_args['output_dir'] = extra_args['output_dir'].replace('NAME', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('SESSION', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('DATE', extra_args['jobname']) extra_args['output_dir'] = extra_args['output_dir'].replace('TIME', extra_args['jobname']) gc3libs.log.debug("Creating Stage1 task for : %d" % index) tasks.append(GATKS1Application( vcf_group, index, **extra_args)) return ParallelTaskCollection(tasks)