def __init__(self, params):
    """Build one GenerateVolumeImagesApp per site of the requested plate.

    Contacts the TissueMAPS server, lists all sites, and creates a
    volume-image-generation task for every site belonging to
    ``params.plate_name``; all tasks run in parallel.

    :param params: namespace with connection settings (host, experiment,
        username, password) and processing settings (plate_name,
        threshold, mean_size, min_size, filter_type,
        minimum_bead_intensity, z_step, pixel_size, alpha, smooth,
        input_path, output_path, channel_string).
    """
    self.params = params
    task_list = []
    print("GenerateVolumeImagesParallel: contacting tissumaps server")
    tmaps_api = TmClient(
        host=self.params.host,
        port=80,
        experiment_name=self.params.experiment,
        username=self.params.username,
        password=self.params.password)
    # find the site dimensions
    sites = tmaps_api.get_sites()
    # BUG FIX: the original passed `len(sites)` as a *second argument*
    # to print() (printing a tuple / the literal "%d"); interpolate it.
    print("found %d sites on tissuemaps" % len(sites))
    for site in sites:
        # only sites of the requested plate are processed
        if site['plate_name'] == self.params.plate_name:
            well_name = site['well_name']
            x = site['x']
            y = site['y']
            task_list.append(
                GenerateVolumeImagesApp(
                    self.params.host,
                    self.params.username,
                    self.params.password,
                    self.params.experiment,
                    self.params.plate_name,
                    well_name, x, y,
                    self.params.threshold,
                    self.params.mean_size,
                    self.params.min_size,
                    self.params.filter_type,
                    self.params.minimum_bead_intensity,
                    self.params.z_step,
                    self.params.pixel_size,
                    self.params.alpha,
                    self.params.smooth,
                    self.params.input_path,
                    self.params.output_path,
                    self.params.channel_string))
    ParallelTaskCollection.__init__(self, task_list, output_dir='')
def __init__(self, param_value, input_file, output_folder, **extra):
    """Run the A and B inner sequential iterations side by side."""
    gc3libs.log.info(
        "\t\t\tCalling InnerParallelIteration.init(%d,%s)"
        % (param_value,input_file))
    self.jobname = "Gdemo_paral_" + str(param_value)
    extra_args = extra.copy()  # XXX: do I need this ?
    extra_args['parent'] = self.jobname
    # one task per inner-iteration flavour, both starting at iteration 0
    tasks = [
        flavour(param_value, input_file, output_folder,
                iteration=0, **extra_args)
        for flavour in (InnerSequentialIterationA,
                        InnerSequentialIterationB)
    ]
    # actually init jobs
    ParallelTaskCollection.__init__(self, tasks, **extra)
def __init__(self, name, experiment_id, verbosity, submission_id,
             user_name, parent_id, description=None):
    '''
    Parameters
    ----------
    name: str
        name of the stage
    experiment_id: int
        ID of the processed experiment
    verbosity: int
        logging verbosity index
    submission_id: int
        ID of the corresponding submission
    user_name: str
        name of the submitting user
    parent_id: int
        ID of the parent
        :class:`Workflow <tmlib.workflow.workflow.Workflow>`
    description: tmlib.tmaps.description.WorkflowStageDescription, optional
        description of the stage (default: ``None``)
    '''
    # Start as an empty parallel collection named after the stage;
    # the actual tasks are managed by the WorkflowStage machinery.
    ParallelTaskCollection.__init__(
        self, tasks=None, jobname='%s' % name
    )
    WorkflowStage.__init__(
        self, name=name, experiment_id=experiment_id, verbosity=verbosity,
        submission_id=submission_id, user_name=user_name,
        parent_id=parent_id, description=description
    )
def __init__(self, **kwargs):
    """Fan out one AnnotateTandemRepeats task per sequence file found
    in the configured input directory."""
    config = kwargs["config"]
    self.c = config["sequencewise_parallel_flow"]
    # TODO: Find all files in dir and create self.lSeq! Warning! Should be done once the
    # Tasks before are finished.
    #self.lSeq = [re.findall(self.c['retag'], i)[0] for i in os.listdir(self.c['input'])]
    # keep every entry except FASTA index files
    self.lSeq = []
    for entry in os.listdir(self.c['input']):
        if not entry.endswith("fai"):
            self.lSeq.append(entry)
    self.kwargs = kwargs
    gc3libs.log.info(
        "\t\tCalling SequencewiseParallelFlow.__init({})".format(
            self.kwargs))
    # one annotation task per sequence; "$N" is substituted with the name
    self.tasks = []
    for iSeq in self.lSeq:
        self.tasks.append(
            AnnotateTandemRepeats(
                name="annotate_tandem_repeats",
                param={"$N": iSeq},
                **kwargs))
    ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
def __init__(self, grayscaled_image, copies, ncolors, output_dir, **extra_args):
    """Spawn `copies` TricolorizeImage tasks, each recoloring the
    grayscaled image with a distinct random combination of `ncolors`
    colors drawn from ``self.colors``."""
    gc3libs.log.info("TricolorizeMultipleImages for %d copies run" % copies)
    self.jobname = "Warholizer_Parallel"
    self.ncolors = ncolors
    ### XXX Why I have to use basename???
    self.output_dir = os.path.join(output_dir, 'tricolorize')
    self.warhol_dir = output_dir
    # Compute a unique sequence of random combination of
    # colors. Please note that we can have a maximum of N!/3! if N
    # is len(colors)
    max_combos = math.factorial(len(self.colors)) / math.factorial(ncolors)
    assert copies <= max_combos
    all_combos = list(itertools.combinations(self.colors, ncolors))
    chosen = random.sample(all_combos, copies)
    # Create all the single tasks
    self.tasks = [
        TricolorizeImage(
            os.path.relpath(grayscaled_image),
            "%s.%d" % (self.output_dir, idx),
            "%s.%d" % (grayscaled_image, idx),
            combo,
            self.warhol_dir,
            **extra_args)
        for idx, combo in enumerate(chosen)
    ]
    ParallelTaskCollection.__init__(self, self.tasks)
def __init__(self, params):
    """Query the TissueMAPS server for every site of ``params.plate``
    and build one GenerateVolumeImagesApp task per matching site."""
    self.params = params
    task_list = []
    tmaps_api = TmClient(
        host=self.params.host,
        port=80,
        experiment_name=self.params.experiment,
        username=self.params.username,
        password=self.params.password
    )
    # find the site dimensions
    for site in tmaps_api.get_sites():
        # skip sites that belong to another plate
        if site['plate_name'] != self.params.plate:
            continue
        task_list.append(
            GenerateVolumeImagesApp(
                self.params.host,
                self.params.username,
                self.params.password,
                self.params.experiment,
                self.params.plate,
                site['well_name'],
                site['x'],
                site['y'],
                self.params.input_path,
                self.params.output_path,
                self.params.fname_stem
            )
        )
    ParallelTaskCollection.__init__(self, task_list, output_dir='')
def __init__(self, param_value, input_file, output_folder, **extra):
    """Launch both inner sequential iterations (A and B) in parallel."""
    gc3libs.log.info("\t\t\tCalling InnerParallelIteration.init(%d,%s)"
                     % (param_value, input_file))
    self.jobname = "Gdemo_paral_" + str(param_value)
    extra_args = extra.copy()  # XXX: do I need this ?
    extra_args['parent'] = self.jobname
    tasks = []
    # same arguments for both flavours; both start at iteration 0
    for task_cls in (InnerSequentialIterationA, InnerSequentialIterationB):
        tasks.append(
            task_cls(param_value, input_file, output_folder,
                     iteration=0, **extra_args))
    # actually init jobs
    ParallelTaskCollection.__init__(self, tasks, **extra)
def __init__(self, grayscaled_image, copies, ncolors, output_dir, **extra_args):
    """Fan out `copies` TricolorizeImage tasks, each recoloring the
    grayscaled image with a distinct random combination of `ncolors`
    colors drawn from ``self.colors``."""
    gc3libs.log.info(
        "TricolorizeMultipleImages for %d copies run" % copies)
    self.jobname = "Warholizer_Parallel"
    self.ncolors = ncolors
    ### XXX Why I have to use basename???
    self.output_dir = os.path.join(
        output_dir, 'tricolorize')
    self.warhol_dir = output_dir

    # Compute a unique sequence of random combination of
    # colors. Please note that we can have a maximum of N!/3! if N
    # is len(colors)
    assert copies <= math.factorial(len(self.colors)) / math.factorial(ncolors)
    combinations = [i for i in itertools.combinations(self.colors, ncolors)]
    # pick `copies` distinct combinations at random (no repetition)
    combinations = random.sample(combinations, copies)

    # Create all the single tasks
    self.tasks = []
    for i, colors in enumerate(combinations):
        self.tasks.append(TricolorizeImage(
            os.path.relpath(grayscaled_image),
            "%s.%d" % (self.output_dir, i),
            "%s.%d" % (grayscaled_image, i),
            colors,
            self.warhol_dir,
            **extra_args))
    ParallelTaskCollection.__init__(self, self.tasks)
def __init__(self, jokes, **kwargs):
    """One InnerSequentialFlow per joke, all running in parallel."""
    self.jokes = jokes
    gc3libs.log.info(
        "\t\tCalling MainParallelFlow.__init({})".format(self.jokes))
    self.tasks = []
    for joke in self.jokes:
        self.tasks.append(InnerSequentialFlow(joke))
    ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
def __init__(self, img, N):
    """Colorize `img` N times, each with three fresh random colors,
    writing every result to its own output directory."""
    apps = []
    for n in range(N):
        # three independent random colors per copy
        col1 = random_color()
        col2 = random_color()
        col3 = random_color()
        out_dir = "colorized-{name}-{nr}.d".format(
            name=basename(img), nr=n)
        apps.append(ColorizeApp(img, col1, col2, col3, out_dir))
    ParallelTaskCollection.__init__(self, apps)
def __init__(self, executable, abc_executable, inputfilelist_abc,
             output_folder, **extra_args):
    """Run one ABC_Application per input file, all in parallel.

    :param executable: wrapper executable passed to each application.
    :param abc_executable: the ABC binary to run.
    :param inputfilelist_abc: iterable of input file paths.
    :param output_folder: destination folder for results.
    """
    parallel_task = []
    # BUG FIX: `name` used to be bound only inside the loop, so an empty
    # input list raised NameError at the final call (and otherwise the
    # collection was silently named after the *last* file only).
    # Keep the last-file naming for non-empty lists, but provide a
    # fallback so an empty list no longer crashes.
    name = "ABC_execution"
    for input_file in inputfilelist_abc:
        name = "ABC_execution_" + os.path.basename(input_file)
        parallel_task.append(
            ABC_Application(executable, abc_executable, input_file,
                            output_folder, **extra_args))
    ParallelTaskCollection.__init__(self, name, parallel_task)
def __init__(self, executable, abc_executable, inputfilelist_abc,
             output_folder, **extra_args):
    """Run one ABC_Application per input file, all in parallel.

    NOTE(review): `name` is (re)bound inside the loop, so the collection
    ends up named after the *last* input file, and an empty
    `inputfilelist_abc` makes the final call raise NameError — confirm
    whether this is intended.
    """
    parallel_task = []
    for input_file in inputfilelist_abc:
        name = "ABC_execution_" + os.path.basename(input_file)
        parallel_task.append(
            ABC_Application(executable, abc_executable, input_file,
                            output_folder, **extra_args))
    ParallelTaskCollection.__init__(self, name, parallel_task)
def __init__(self, param_value, input_file_folder, output_folder, **extra):
    """Spawn one InnerParallelIteration per file in `input_file_folder`,
    all running in parallel.

    :param param_value: parameter forwarded to every inner iteration.
    :param input_file_folder: directory whose files are processed.
    :param output_folder: destination folder for results.
    """
    self.jobname = "Gdemo_MainParal_" + str(param_value)
    gc3libs.log.info("\t\tCalling MainParallelIteration.__init(%d,%s)"
                     % (param_value, input_file_folder))
    self.tasks = []
    for input_file in os.listdir(input_file_folder):
        # BUG FIX: os.listdir() yields bare names; os.path.abspath()
        # alone resolved them against the *current working directory*,
        # not `input_file_folder`.  Join with the folder first.
        path = os.path.abspath(os.path.join(input_file_folder, input_file))
        self.tasks.append(
            InnerParallelIteration(param_value, path, output_folder))
    ParallelTaskCollection.__init__(self, self.tasks, **extra)
def __init__(self, params):
    """Build one GenerateVolumeImagesParallel child per experiment.

    Iterates the parallel lists ``params.experiment`` /
    ``params.input_path`` / ``params.output_path`` / ``params.fname_stem``
    in lockstep and creates one child collection per tuple.

    BUG FIX: the original rebound the fields of the *single shared*
    `params` object inside the loop, so every child task referenced the
    same namespace (holding the last iteration's values once the loop
    finished).  Each child now receives its own shallow copy.
    """
    import copy  # local import keeps this fix self-contained
    self.params = params
    task_list = []
    for experiment, input_path, output_path, fname_stem in itertools.izip(
            params.experiment, params.input_path,
            params.output_path, params.fname_stem):
        print(experiment, input_path, output_path, fname_stem)
        child_params = copy.copy(params)
        child_params.experiment = experiment
        child_params.input_path = input_path
        child_params.output_path = output_path
        child_params.fname_stem = fname_stem
        task_list.append(GenerateVolumeImagesParallel(child_params))
    ParallelTaskCollection.__init__(self, task_list, output_dir='')
def __init__(self, directory, pattern, task_ctor, **extra_args):
    """Create one task (via `task_ctor`) for every file in `directory`
    whose name matches the shell-style `pattern`; run all in parallel."""
    matching = [
        os.path.join(directory, fname)
        for fname in os.listdir(directory)
        if fnmatch.fnmatch(fname, pattern)
    ]
    tasks = [task_ctor(path, **extra_args) for path in matching]
    ParallelTaskCollection.__init__(
        self,
        # job name
        make_identifier("Process %s files in directory %s"
                        % (pattern, directory)),
        # list of tasks to execute
        tasks,
        # boilerplate
        **extra_args)
def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas,
             **sessionParas):
    """Parallel collection evaluating one solver guess per parameter
    combination."""
    logger.debug('entering idRiskParaSearchParallel.__init__')
    # create jobname: base name plus every parameter combination,
    # so the collection name is unique per combination set
    name_parts = ['evalSolverGuess' + '_' + sessionParas['jobname']]
    for paraCombo in paraCombos:
        name_parts.append(str(paraCombo))
    self.jobname = ''.join(name_parts)
    self.substs = substs
    self.optimFolder = optimFolder
    self.solverParas = solverParas
    self.sessionParas = sessionParas
    tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas)
    ParallelTaskCollection.__init__(self, self.jobname, tasks)
    logger.debug('done idRiskParaSearchParallel.__init__')
def new_tasks(self, extra):
    """Return a single task: a parallel or sequential collection of
    GRunApplications, or one plain application if neither was asked."""
    appextra = extra.copy()
    del appextra['output_dir']

    def _make_apps(count):
        # one GRunApplication per index, each with its own output dir
        return [
            GRunApplication(self.params.args,
                            jobname='GRunApplication.%d' % i,
                            output_dir='GRunApplication.%d.d' % i,
                            **appextra)
            for i in range(count)
        ]

    if self.params.parallel:
        task = ParallelTaskCollection(
            _make_apps(self.params.parallel), **extra)
    elif self.params.sequential:
        task = SequentialTaskCollection(
            _make_apps(self.params.sequential), **extra)
    else:
        task = GRunApplication(self.params.args, **extra)
    return [task]
def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas, **sessionParas): logger.debug("entering idRiskParaSearchParallel.__init__") # create jobname self.jobname = "evalSolverGuess" + "_" + sessionParas["jobname"] for paraCombo in paraCombos: self.jobname += str(paraCombo) self.substs = substs self.optimFolder = optimFolder self.solverParas = solverParas self.sessionParas = sessionParas tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas) ParallelTaskCollection.__init__(self, self.jobname, tasks) logger.debug("done idRiskParaSearchParallel.__init__")
def stage0(self):
    """
    Chunk input table and run chunks in parallel
    """
    tasks = []
    for (input_file, index_chunk) in generate_chunked_files_and_list(
            self.input_table_file, self.chunk_size):
        jobname = "gbugs-%s" % (str(index_chunk))
        extra_args = self.extra.copy()
        extra_args['index_chunk'] = str(index_chunk)
        extra_args['jobname'] = jobname
        # extra_args['output_dir'] = self.params.output
        # substitute every supported placeholder with the job name
        for placeholder in ('NAME', 'SESSION', 'DATE', 'TIME'):
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                placeholder, jobname)
        if self.driver_script:
            extra_args['driver_script'] = self.driver_script
        gc3libs.log.debug("Creating Task for index : %d - %d"
                          % (index_chunk, (index_chunk + self.chunk_size)))
        tasks.append(GBugsApplication(input_file, **extra_args))
    return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    """
    For each line of the input .csv file generate an execution Task
    """
    tasks = []
    for run, parameter in enumerate(
            self._enumerate_csv(self.params.csv_input_file), start=1):
        parameter_str = '.'.join(str(x) for x in parameter)
        # every CSV row must carry exactly 11 parameters
        if len(parameter) != 11:
            raise gc3libs.exceptions.InvalidUsage(
                "Parameter length not correct")
        jobname = "run%s" % str(run)
        extra_args = extra.copy()
        extra_args['jobname'] = jobname
        #Everything in results folder on remote computer
        extra_args['output_dir'] = CLOUDNAME
        #Not working
        #extra_args['output_dir'] = extra_args['output_dir'].replace('NAME', DEFAULT_REMOTE_OUTPUT_FOLDER)
        #save on local machine#
        extra_args['output_dir'] = "%s%s" % (extra_args['output_dir'], jobname)
        tasks.append(
            MatlabApp(self.params.matlab_function, parameter,
                      self.params.matlab_source_folder, run,
                      **extra_args))
    return [ParallelTaskCollection(tasks, **extra)]
def __init__(self, directory, basename, pattern, **extra_args):
    """Stage 0 of the Swath workflow: long chromatogram extraction runs
    in parallel with short extraction + normalization."""
    self.directory = directory
    self.basename = basename
    self.pattern = basename + pattern
    self.extra_args = extra_args
    jobname = make_identifier(
        "Stage 0 of Swath workflow in directory %s processing files %s"
        % (directory, pattern))
    subtasks = [
        ProcessFilesInParallel(directory, pattern, ChromaExtractLong,
                               **extra_args),
        ChromaExtractShortPlusNormalization(directory, pattern,
                                            **extra_args),
    ]
    ParallelTaskCollection.__init__(self, jobname, subtasks, **extra_args)
def __init__(self, xVars, paraCombos, substs, optimFolder, solverParas, **sessionParas): logger.debug('entering idRiskParaSearchParallel.__init__') # create jobname self.jobname = 'evalSolverGuess' + '_' + sessionParas['jobname'] for paraCombo in paraCombos: self.jobname += str(paraCombo) self.substs = substs self.optimFolder = optimFolder self.solverParas = solverParas self.sessionParas = sessionParas forwardPremium.paraLoop_fp.__init__(self, verbosity = 'INFO') tasks = self.generateTaskList(xVars, paraCombos, substs, sessionParas) ParallelTaskCollection.__init__(self, self.jobname, tasks) logger.debug('done idRiskParaSearchParallel.__init__')
def __init__(self, directory, pattern, task_ctor, **extra_args):
    """Build one task (via `task_ctor`) for every file in `directory`
    matching the shell-style `pattern`; run all tasks in parallel."""
    tasks = []
    for filename in os.listdir(directory):
        # skip entries that do not match the pattern
        if not fnmatch.fnmatch(filename, pattern):
            continue
        pathname = os.path.join(directory, filename)
        tasks.append(task_ctor(pathname, **extra_args))
    ParallelTaskCollection.__init__(
        self,
        # job name
        make_identifier("Process %s files in directory %s"
                        % (pattern, directory)),
        # list of tasks to execute
        tasks,
        # boilerplate
        **extra_args)
def new_tasks(self, extra):
    """One GRunApplication per input directory, all in parallel."""
    apps = []
    for path in self.params.input_dirs:
        # work on the directory's base name only
        image = os.path.basename(path)
        output_dir = "colorized-{name}.d".format(name=basename(image))
        apps.append(GRunApplication(image, output_dir))
    return [ParallelTaskCollection(apps)]
def stage1(self):
    """
    Run a RICC2 job for each valid CBAS/CABS basis combination,
    re-using the results from RIDFT in `stage0`.

    If RIDFT failed, exit immediately.
    """
    # terminate if first stage was unsuccessful
    rc = self.tasks[0].execution.returncode
    if rc is not None and rc != 0:
        return rc
    # else, proceeed with 2nd pass
    pass2 = []
    ridft_coord = os.path.join(self.tasks[0].turbomole_output_dir, 'coord')
    for ricc2_in in self.ricc2_ins:
        cbas = ricc2_in._keywords['CBAS_BASIS']
        cabs = ricc2_in._keywords['CABS_BASIS']
        ricc2_dir = os.path.join(self.work_dir,
                                 'cbas-%s/cabs-%s/ricc2' % (cbas, cabs))
        gc3libs.utils.mkdir(ricc2_dir)
        gc3libs.utils.copyfile(ridft_coord, ricc2_dir)
        ricc2_define_in = _make_define_in(ricc2_dir, ricc2_in)
        ricc2_output_dir = os.path.join(ricc2_dir, 'output')
        # guess duration of the RICC2 job: the large aug-cc-pV5Z basis
        # anywhere in the combination quadruples the walltime request
        extra = self.extra.copy()
        if ('aug-cc-pV5Z' == self.orb_basis
                or 'aug-cc-pV5Z' == self.rijk_basis
                or 'aug-cc-pV5Z' == cbas
                or 'aug-cc-pV5Z' == cabs):
            extra.setdefault('requested_walltime', 4*hours)
        else:
            extra.setdefault('requested_walltime', 1*hours)
        pass2.append(
            TurbomoleAndXmlProcessingPass(
                # job name
                ('ricc2-%s-%s-%s' % (self.name, cbas, cabs)),
                # TURBOMOLE application to run
                NonLocalTurbomoleDefineApplication(
                    'ricc2', ricc2_define_in,
                    # the second pass builds on files defined in the first one
                    os.path.join(ricc2_dir, 'coord'),
                    os.path.join(self.tasks[0].turbomole_output_dir, 'control'),
                    os.path.join(self.tasks[0].turbomole_output_dir, 'energy'),
                    os.path.join(self.tasks[0].turbomole_output_dir, 'mos'),
                    os.path.join(self.tasks[0].turbomole_output_dir, 'basis'),
                    os.path.join(self.tasks[0].turbomole_output_dir, 'auxbasis'),
                    output_dir = ricc2_output_dir,
                    stdout = 'ricc2.out',
                    **extra),
                os.path.join(ricc2_output_dir, 'xml-processing'),
                # DB parameters
                # FIXME: make these settable on the command-line
                db_dir='/db/home/fox/gricomp',
                db_user='******', db_pass='******',
                # TaskCollection required params
                **self.extra))
        # BUG FIX: this log message's string literal was split across a
        # raw newline (a syntax error); re-joined onto one line.
        gc3libs.log.debug("Created RICC2 task in directory '%s'", ricc2_dir)
    return (ParallelTaskCollection(self.name + '.pass2', pass2))
def __init__(self, param_value, input_file_folder, output_folder, **extra):
    """Spawn one InnerParallelIteration per file in `input_file_folder`,
    all running in parallel."""
    self.jobname = "Gdemo_MainParal_"+str(param_value)
    gc3libs.log.info("\t\tCalling MainParallelIteration.__init(%d,%s)"
                     % (param_value,input_file_folder))
    self.tasks = []
    for input_file in os.listdir(input_file_folder):
        # NOTE(review): os.listdir() yields bare names, so abspath()
        # resolves against the CWD rather than `input_file_folder`;
        # this only works when the two coincide — confirm, and consider
        # os.path.join(input_file_folder, input_file) instead.
        self.tasks.append(
            InnerParallelIteration(
                param_value,
                os.path.abspath(input_file),
                output_folder
            )
        )
    ParallelTaskCollection.__init__(self, self.tasks, **extra)
def __init__(self, title, coord, bases, jkbases, cbases, cabses, work_dir,
             valid1=acceptable_ridft_basis_set,
             valid2=acceptable_ricc2_basis_set, **extra_args):
    """
    Create a new tasks that runs several analyses in parallel, one
    for each accepted combination of orbital and RIJK basis.
    """
    extra_args.setdefault('memory', 2000)  # XXX: check with `requested_memory`
    # RIDFT template: expanded over orbital and RIJK bases
    ridft_define_in = Template(
        RIDFT_DEFINE_IN, valid1,
        TITLE=title,
        ORB_BASIS=bases,
        RIJK_BASIS=jkbases,
        RIDFT_MEMORY=[extra_args['memory']],
    )  # end of RIDFT template
    # RICC2 template: expanded over CBAS/CABS bases
    ricc2_define_in = Template(
        RICC2_DEFINE_IN, valid2,
        # the ORB_BASIS will be derived from the RIDFT_DEFINE_IN template
        CBAS_BASIS=cbases,
        CABS_BASIS=cabses,
        RICC2_MEMORY=[extra_args['memory']],
    )  # end of RICC2 template
    tasks = []
    for ridft in expansions(ridft_define_in):
        orb_basis = ridft._keywords['ORB_BASIS']
        ricc2_list = list(expansions(ricc2_define_in, ORB_BASIS=orb_basis))
        tasks.append(
            BasisSweepPasses(title + '.seq', coord, ridft, ricc2_list,
                             work_dir, **extra_args))
    ParallelTaskCollection.__init__(self, title, tasks)
def __init__(self, pop, jobname, iteration, path_to_stage_dir, cur_pop_file, task_constructor, **extra_args): gc3libs.log.debug('entering ComputeTargetVals.__init__') # Set up initial variables and set the correct methods. self.jobname = jobname + '-' + \ 'compute_target_vals' + '-' + str(iteration) self.iteration = iteration self.path_to_stage_dir = path_to_stage_dir # ComputeTargetVals produces no output. # But attribute needs to be specified. self.output_dir = path_to_stage_dir self.cur_pop_file = cur_pop_file self.verbosity = 'DEBUG' self.extra_args = extra_args # Log activity cDate = datetime.date.today() cTime = datetime.datetime.time(datetime.datetime.now()) date_string = '%04d--%02d--%02d--%02d--%02d--%02d' % ( cDate.year, cDate.month, cDate.day, cTime.hour, cTime.minute, cTime.second) gc3libs.log.debug('Establishing parallel task on %s', date_string) # Enter an iteration specific folder self.iteration_folder = os.path.join( self.path_to_stage_dir, 'Iteration-' + str(self.iteration)) try: os.mkdir(self.iteration_folder) except OSError: print '%s already exists' % self.iteration_folder # save pop to file if cur_pop_file: np.savetxt(os.path.join(self.iteration_folder, cur_pop_file), pop, delimiter=' ') self.tasks = [ task_constructor(pop_mem, self.iteration_folder) for pop_mem in pop ] ParallelTaskCollection.__init__(self, self.tasks, **extra_args)
def __init__(self, directory, basename, pattern, **extra_args):
    """Stage 0 of the Swath workflow: run the long chromatogram
    extraction in parallel with short extraction + normalization."""
    self.directory = directory
    self.basename = basename
    # full match pattern: base name followed by the given suffix pattern
    self.pattern = basename + pattern
    self.extra_args = extra_args
    ParallelTaskCollection.__init__(
        self,
        # jobname
        make_identifier(
            "Stage 0 of Swath workflow in directory %s processing files %s"
            % (directory, pattern)),
        # tasks
        [
            ProcessFilesInParallel(directory, pattern, ChromaExtractLong,
                                   **extra_args),
            ChromaExtractShortPlusNormalization(directory, pattern,
                                                ** extra_args),
        ],
        # boilerplate
        **extra_args)
def __init__(self, tests=None, **extra):
    """
    `tests` is a list of subdirectories which must match the
    `RunTestsInParallel` dictionary
    """
    if tests:
        # restrict to the requested subset of test directories
        tests = dict((k, v) for k, v in self.applicationdirs.iteritems()
                     if k in tests)
    else:
        tests = self.applicationdirs
    extra['output_dir'] = "RunTestAppsInParallel"
    tasks = []
    for testdir, classes in tests.iteritems():
        appdir = os.path.abspath(testdir)
        # only classes that are both a Task and a TestRunner qualify
        for cls in classes:
            if issubclass(cls, Task) and issubclass(cls, TestRunner):
                tasks.append(cls(appdir, **extra))
    if not tasks:
        raise RuntimeError("No tasks found")
    ParallelTaskCollection.__init__(self, tasks, **extra)
def __init__(self, **kwargs):
    """Fan out one AnnotateTandemRepeats task per sequence file found
    in the configured input directory."""
    config = kwargs["config"]
    # config section relevant to this flow
    self.c = config["sequencewise_parallel_flow"]
    # TODO: Find all files in dir and create self.lSeq! Warning! Should be done once the
    # Tasks before are finished.
    #self.lSeq = [re.findall(self.c['retag'], i)[0] for i in os.listdir(self.c['input'])]
    # every file except FASTA index (".fai") files
    self.lSeq = [
        i for i in os.listdir(self.c['input']) if not i.endswith("fai")
    ]
    self.kwargs = kwargs
    gc3libs.log.info(
        "\t\tCalling SequencewiseParallelFlow.__init({})".format(
            self.kwargs))
    # one annotation task per sequence; "$N" is substituted with the name
    self.tasks = [
        AnnotateTandemRepeats(name="annotate_tandem_repeats",
                              param={"$N": iSeq},
                              **kwargs)
        for iSeq in self.lSeq
    ]
    ParallelTaskCollection.__init__(self, self.tasks, **kwargs)
def __init__(self, title, coord, bases, jkbases, cbases, cabses, work_dir,
             valid1=acceptable_ridft_basis_set,
             valid2=acceptable_ricc2_basis_set, **extra_args):
    """
    Create a new tasks that runs several analyses in parallel, one
    for each accepted combination of orbital and RIJK basis.
    """
    extra_args.setdefault('memory', 2000)  # XXX: check with `requested_memory`
    # RIDFT template: expanded over orbital and RIJK bases below
    ridft_define_in = Template(
        RIDFT_DEFINE_IN, valid1,
        TITLE=title,
        ORB_BASIS=bases,
        RIJK_BASIS=jkbases,
        RIDFT_MEMORY = [extra_args['memory']]
    )  # end of RIDFT template
    # RICC2 template: expanded over CBAS/CABS bases
    ricc2_define_in = Template(
        RICC2_DEFINE_IN, valid2,
        # the ORB_BASIS will be derived from the RIDFT_DEFINE_IN template
        CBAS_BASIS=cbases,
        CABS_BASIS=cabses,
        RICC2_MEMORY = [extra_args['memory']],
    )  # end of RICC2 template
    tasks = []
    for ridft in expansions(ridft_define_in):
        orb_basis = ridft._keywords['ORB_BASIS']
        # one sequential sweep per RIDFT expansion, carrying along all
        # RICC2 expansions constrained to the same orbital basis
        tasks.append(
            BasisSweepPasses(
                title + '.seq', coord, ridft,
                list(expansions(ricc2_define_in, ORB_BASIS=orb_basis)),
                work_dir, **extra_args))
    ParallelTaskCollection.__init__(self, title, tasks)
def stage0(self):
    """
    Stage 0: run the GATK HaplotypeCaller pipeline steps 1-3 for every
    sample.

    One task per (bam, bai) pair found in the input folder; each task
    produces a per-sample `.g.vcf` (plus index).  Roughly 24-72 hours
    per sample on a single core.
    """
    tasks = []
    for (bam_file, bai_file) in get_bams(self.input_bam_folder):
        extra_args = self.extra.copy()
        extra_args['sample_name'] = os.path.basename(bam_file).split('.')[0]
        extra_args['bam_filename'] = os.path.basename(bam_file)
        extra_args['bai_filename'] = os.path.basename(bai_file)
        jobname = "gatk-s0-%s" % extra_args['bam_filename']
        extra_args['jobname'] = jobname
        # substitute every supported placeholder with the job name
        for token in ('NAME', 'SESSION', 'DATE', 'TIME'):
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                token, jobname)
        gc3libs.log.debug("Creating Stage0 task for : %s"
                          % (extra_args['bam_filename']))
        tasks.append(GATKS0Application(bam_file, bai_file, **extra_args))
    return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    """Create one WarholizeWorkflow per image file given on the command
    line, and run them all in parallel.

    :param extra: keyword arguments forwarded to every workflow.
    :raises gc3libs.exceptions.InvalidUsage: if no valid image file
        was given.
    """
    # BUG FIX: the body used to start with a bare `extra` expression
    # statement — a no-op; removed.
    if self.params.size:
        extra['size'] = self.params.size
    gc3libs.log.info("Creating main sequential task")
    tasks = []
    for (i, input_file) in enumerate(self.params.args):
        if not os.path.isfile(input_file):
            gc3libs.log.error("Argument `%s` is NOT a file. Ignoring"
                              % input_file)
            continue
        extra_args = extra.copy()
        extra_args['output_dir'] = 'Warholized.%s' % os.path.basename(
            input_file)
        tasks.append(
            WarholizeWorkflow(input_file, self.params.copies,
                              self.params.num_colors, **extra_args))
    if not tasks:
        raise gc3libs.exceptions.InvalidUsage(
            "Missing or invalid image file.")
    return [ParallelTaskCollection(tasks, **extra)]
def stage1(self):
    """
    Step 1: For each available statistical method, run independent
    application
    """
    tasks = []
    for method in STATS:
        extra_args = self.extra.copy()
        extra_args['jobname'] = method
        extra_args['results'] = self.s1_outputfolder
        # substitute every supported placeholder with the job name
        for token in ('NAME', 'SESSION', 'DATE', 'TIME'):
            extra_args['output_dir'] = extra_args['output_dir'].replace(
                token, extra_args['jobname'])
        tasks.append(
            GenREMDatasetApplication(method, [self.s0_outputfolder],
                                     self.source_folder, **extra_args))
    return ParallelTaskCollection(tasks)
def new_tasks(self, extra):
    """Create one WarholizeWorkflow per image file on the command line
    and run them all in parallel."""
    if self.params.size:
        extra['size'] = self.params.size
    tasks = []
    for i, input_file in enumerate(self.params.args):
        if not os.path.isfile(input_file):
            gc3libs.log.error("Argument `%s` is NOT a file. Ignoring",
                              input_file)
            continue
        gc3libs.log.info(
            "Creating sequential task for processing file `%s`", input_file)
        extra_args = extra.copy()
        base_dir = extra_args.get('output_dir', os.getcwd())
        out_dir = os.path.join(
            base_dir, 'Warholized.' + os.path.basename(input_file))
        extra_args['output_dir'] = out_dir.replace(
            '/NAME/', '/')  ## yes, it's a bug
        tasks.append(
            WarholizeWorkflow(input_file, self.params.copies,
                              self.params.num_colors, **extra_args))
    if not tasks:
        raise gc3libs.exceptions.InvalidUsage(
            "Missing or invalid image file.")
    return [ParallelTaskCollection(tasks, **extra)]
def __init__(self, inParaCombos, iteration, pathToExecutable, pathToStageDir,
             architecture, baseDir, xVars, solverVerb, problemType,
             analyzeResults, ctryList, **extra_args):
    '''
    Generate a list of tasks and initialize a ParallelTaskCollection
    with them.  Uses paraLoop class to generate a list of
    (descriptions, substitutions for the input files).  Descriptions
    are generated from variable names that are hard coded in this
    method right now.  Uses method generateTaskList to create a list
    of GPremiumApplication's which are invoked from a list of inputs
    (appropriately adjusted input files), the output directory and
    some further settings for each run.

    inParaCombos:     List of tuples defining the parameter combinations.
    iteration:        Current iteration number.
    pathToExecutable: Path to the executable (the external program
                      to be called).
    pathToStageDir:   Root path. Usually os.getcwd()
    architecture:     32 or 64 bit.
    baseDir:          Directory in which the input files are located.
    xVars:            Names of the x variables.
    solverVerb:       Logger verbosity.
    problemType:      Forward premium specific flag to determine which
                      case to look at.
    analyzeResults:   Function to use to analyze the emerging output.
    ctryList:         Forward premium specific list of ctrys to look at.
    '''
    logger.debug('entering gParaSearchParalell.__init__')

    # Set up initial variables and set the correct methods.
    self.pathToStageDir = pathToStageDir
    self.problemType = problemType
    self.executable = pathToExecutable
    self.architecture = architecture
    self.baseDir = baseDir
    self.verbosity = solverVerb.upper()
    self.xVars = xVars
    # number of x variables (whitespace-separated names)
    self.n = len(self.xVars.split())
    self.analyzeResults = analyzeResults
    self.ctryList = ctryList
    self.iteration = iteration
    self.jobname = 'evalSolverGuess' + '-' + extra_args['jobname'] + '-' + str(self.iteration)
    self.extra_args = extra_args
    tasks = []

    # --- createJobs_x ---

    # Log activity
    cDate = datetime.date.today()
    cTime = datetime.datetime.time(datetime.datetime.now())
    dateString = '{0:04d}-{1:02d}-{2:02d}-{3:02d}-{4:02d}-{5:02d}'.format(
        cDate.year, cDate.month, cDate.day,
        cTime.hour, cTime.minute, cTime.second)
    logger.debug('Establishing parallel task on %s' % dateString)

    # Enter an iteration specific folder
    self.iterationFolder = os.path.join(
        self.pathToStageDir, 'Iteration-' + str(self.iteration))
    try:
        os.mkdir(self.iterationFolder)
    except OSError:
        # pre-existing folder from a previous run is fine (Python 2 print)
        print '%s already exists' % self.iterationFolder

    # save population to file
    np.savetxt(os.path.join(self.iterationFolder, 'curPopulation'),
               inParaCombos, delimiter = ' ')

    # Take the list of parameter combinations and translate them in a
    # comma separated list of values for each variable to be fed into
    # paraLoop file.
    # This can be done much more elegantly with ','.join() but it works...
    vals = []
    nVariables = range(len(inParaCombos[0]))
    for ixVar in nVariables:
        varValString = ''
        for ixParaCombo, paraCombo in enumerate(inParaCombos):
            ### Should make more precise string conversion.
            varValString += str(paraCombo[ixVar])
            if ixParaCombo < len(inParaCombos) - 1:
                varValString += ', '
        vals.append( varValString )

    # Make problem specific adjustments to the paraLoop file.
    if self.problemType == 'one4all':
        print 'one4all'
        variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
        groups = [ 0, 0, 1, 1, 1, 1 ]
        groupRestrs = [ 'lowerTr', 'lowerTr', 'diagnol', 'diagnol',
                        'diagnol', 'diagnol' ]
        writeVals = [ ", ".join(self.ctryList), ", ".join(self.ctryList),
                      vals[0], vals[0], vals[1], vals[1] ]
        self.variables = ['EA','sigmaA']
        self.paraCombos = inParaCombos
        paraFiles = [ 'input/markovA.in', 'input/markovB.in',
                      'input/parameters.in', 'input/parameters.in',
                      'input/parameters.in', 'input/parameters.in' ]
        paraFileRegex = [ 'space-separated', 'space-separated',
                          'bar-separated', 'bar-separated' ,
                          'bar-separated' , 'bar-separated' ]
        self.analyzeResults.tablePath = self.iterationFolder
    elif self.problemType == 'one4eachPair':
        print 'one4eachPair'
        # Check if EA or sigmaA are alone in the specified parameters.
        # If so make diagnol adjustments
        writeVals = []
        # NOTE(review): `variables`, `groups`, `groupRestrs`,
        # `paraCombosEA` and `paraCombosSigmaA` are only bound inside
        # the two `if` branches below; if 'EA'/'sigmaA' are absent from
        # xVars the later references raise NameError — confirm callers
        # always pass both.
        if 'EA' in self.xVars and not 'EB' in self.xVars:
            variables = [ 'EA', 'EB' ]
            groups = [ '0', '0' ]
            groupRestrs = [ 'diagnol', 'diagnol' ]
            writeVals.append(vals[0])
            writeVals.append(vals[0])
            paraCombosEA = [ np.append(ele[0], ele[0]) for ele in inParaCombos ]
        if 'sigmaA' in self.xVars and not 'sigmaB' in self.xVars:
            variables.append( 'sigmaA')
            variables.append('sigmaB')
            groups.append( '0')
            groups.append('0')
            groupRestrs.append( 'diagnol')
            groupRestrs.append( 'diagnol' )
            writeVals.append(vals[1])
            writeVals.append(vals[1])
            paraCombosSigmaA = [ np.append(ele[1], ele[1]) for ele in inParaCombos ]
        # match ctry with val
        ctryVals = {}
        for ixCtry, ctry in enumerate(ctryList):
            ctryVals[ctry] = vals
        self.variables = variables
        # Prepare paraCombos matching to resulting table. Used in
        # analyzeOverviewTable
        # !!! This should be dependent on problem type or on missing
        # variables in xvars. !!!
        paraCombos = []
        for EA,sA in zip(paraCombosEA, paraCombosSigmaA):
            paraCombo = np.append(EA, sA)
            paraCombos.append(paraCombo)
        self.paraCombos = paraCombos
        paraFiles = [ 'input/parameters.in', 'input/parameters.in',
                      'input/parameters.in', 'input/parameters.in' ]
        paraFileRegex = [ 'bar-separated', 'bar-separated' ,
                          'bar-separated' , 'bar-separated' ]
    elif self.problemType == 'one4eachCtry':
        print 'one4eachCtry'
        ctry1List = []
        ctry2List = []
        EAList = []
        EBList = []
        sigmaAList = []
        sigmaBList = []
        self.paraCombos = []
        # all (ctry_i, ctry_j) pairs in the lower triangle
        ctryIndices = getIndex([len(ctryList), len(ctryList)], 'lowerTr')
        for ixCombo in range(len(inParaCombos)):
            ctry1ListCombo = []
            ctry2ListCombo = []
            EAListCombo = []
            EBListCombo = []
            sigmaAListCombo = []
            sigmaBListCombo = []
            for ctryIndex in ctryIndices:
                ctry1ListCombo.append(ctryList[ctryIndex[0]])
                ctry2ListCombo.append(ctryList[ctryIndex[1]])
                # per-country (E, sigma) values are laid out pairwise
                # in each combination vector
                EAListCombo.append(inParaCombos[ixCombo][0 + 2 * ctryIndex[0]])
                sigmaAListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[0]])
                EBListCombo.append(inParaCombos[ixCombo][0 + 2 *ctryIndex[1]])
                sigmaBListCombo.append(inParaCombos[ixCombo][1 + 2 * ctryIndex[1]])
            self.paraCombos.append(zip(ctry1ListCombo, ctry2ListCombo,
                                       EAListCombo, sigmaAListCombo,
                                       EBListCombo, sigmaBListCombo))
            ctry1List.extend(ctry1ListCombo)
            ctry2List.extend(ctry2ListCombo)
            EAList.extend(map(str, EAListCombo))
            EBList.extend(map(str, EBListCombo))
            sigmaAList.extend(map(str, sigmaAListCombo))
            sigmaBList.extend(map(str, sigmaBListCombo))
        variables = ['Ctry', 'Ctry', 'EA', 'EB', 'sigmaA', 'sigmaB']
        groups = [ 0, 0, 0, 0, 0, 0 ]
        groupRestrs = [ 'diagnol', 'diagnol', 'diagnol', 'diagnol',
                        'diagnol', 'diagnol' ]
        writeVals = [ ", ".join(ctry1List), ", ".join(ctry2List),
                      ", ".join(EAList), ", ".join(EBList),
                      ", ".join(sigmaAList),", ".join(sigmaBList)]
        paraFiles = [ 'input/markovA.in', 'input/markovB.in',
                      'input/parameters.in', 'input/parameters.in',
                      'input/parameters.in', 'input/parameters.in' ]
        paraFileRegex = [ 'space-separated', 'space-separated',
                          'bar-separated', 'bar-separated' ,
                          'bar-separated' , 'bar-separated' ]
        #self.paraCombos = inParaCombos
        self.analyzeResults.tablePath = self.iterationFolder
        # variable list passed to analyzeOverviewTables
        self.variables = ['EA', 'sigmaA', 'EB', 'sigmaB']
        print 'Done setting up one4eachCtry. '

    # Write a para.loop file to generate grid jobs
    para_loop = self.writeParaLoop(
        variables = variables, groups = groups, groupRestrs = groupRestrs,
        vals = writeVals,
        desPath = os.path.join(self.iterationFolder, 'para.loopTmp'),
        paraFiles = paraFiles, paraFileRegex = paraFileRegex)
    paraLoop_fp.__init__(self, verbosity = self.verbosity)
    tasks = self.generateTaskList(para_loop, self.iterationFolder)
    ParallelTaskCollection.__init__(self, self.jobname, tasks)
def __init__(self, num_tasks, **extra_args):
    """
    Build a parallel collection of `num_tasks` independent
    `SuccessfulApp` stages, named 'stage0', 'stage1', ...

    Any extra keyword arguments are forwarded unchanged to the
    `ParallelTaskCollection` constructor.
    """
    # Create one application per stage index.
    stage_apps = []
    for idx in range(num_tasks):
        stage_apps.append(SuccessfulApp('stage{n}'.format(n=idx)))
    ParallelTaskCollection.__init__(self, stage_apps, **extra_args)
def __init__(self, num_tasks, **extra_args):
    """
    Initialize a parallel collection made of `num_tasks` independent
    `SuccessfulApp` tasks ('stage0' through 'stage<num_tasks-1>').

    Extra keyword arguments are passed through to
    `ParallelTaskCollection.__init__`.
    """
    # Generate the stage names first, then wrap each in an application.
    stage_names = ['stage{n}'.format(n=i) for i in range(num_tasks)]
    apps = [SuccessfulApp(name) for name in stage_names]
    ParallelTaskCollection.__init__(self, apps, **extra_args)
def stage1(self):
    """
    Build the stage-1 task collection: combine the per-sample .g.vcf
    files produced by stage0 into grouped GVCF files.

    Stage1 should start only if stage0 completed entirely (no
    failures).  The outputs listed under the stage0 output folder are
    grouped into chunks of `self.extra['S1_group']` files each (e.g.
    30 out of 300), and one `GATKS1Application` is created per chunk,
    running the "combine_gvcf" script — GATK's ``CombineGVCFs`` tool,
    which takes an arbitrary number of .g.vcf inputs and produces a
    single combined .g.vcf::

        java -jar GenomeAnalysisTK.jar -T CombineGVCFs \\
            -R <goat.genome reference .fa> \\
            --variant <sample1>.g.vcf --variant <sample2>.g.vcf ... \\
            -o combined.g.vcf

    Use the same GATK and goat.genome versions as in stage0.  If a
    task fails because of Java heap size, re-run with more memory.
    Indicative resources per task: walltime ~2 days, 10 cores;
    memory needs checking (128GB top, 500GB quoted).

    :returns: a `ParallelTaskCollection` with one `GATKS1Application`
        per group of .g.vcf files.
    """
    # XXX: add check if stage0 completed properly
    # Stop otherwise
    stage1_tasks = []
    group_size = int(self.extra['S1_group'])
    for vcf_group, index in get_vcf_group(self.extra['S0_output'], group_size):
        extra_args = self.extra.copy()
        jobname = "gatk-s1-%d" % index
        extra_args['jobname'] = jobname
        # Substitute every templating placeholder in the output
        # directory with the per-group job name.
        output_dir = extra_args['output_dir']
        for placeholder in ('NAME', 'SESSION', 'DATE', 'TIME'):
            output_dir = output_dir.replace(placeholder, jobname)
        extra_args['output_dir'] = output_dir
        gc3libs.log.debug("Creating Stage1 task for : %d" % index)
        stage1_tasks.append(GATKS1Application(vcf_group, index, **extra_args))
    return ParallelTaskCollection(stage1_tasks)
def __init__(self, executable, input_values_file, iteration, total_iterations,
             slice_size=0, datadir=TMPDIR, extra=None, parent=None):
    """
    Create a new task that runs `executable` over the set of values
    contained in file `input_values_file` (one floating-point number
    per line).

    If `slice_size` is a positive integer, then chop the input into
    chunks of -at most- the given size and compute them as separate
    independent jobs.

    Any other argument is passed unchanged to the
    `ParallelTaskCollection` ctor.

    :param executable: program each sub-task runs.
    :param input_values_file: path to the file of input values.
    :param iteration: index of this pass, zero-padded into the job name.
    :param total_iterations: total pass count; determines the padding
        width so job names sort correctly in plain `ls -l` output.
    :param slice_size: max values per sub-task; 0 (or any value < 1)
        means a single slice covering all values.
    :param datadir: directory under which this pass's output
        sub-directory is created.
    :param extra: dict of extra keyword arguments forwarded to each
        sub-task; ``None`` (the default) means an empty dict.
    :param parent: optional name prefix for this pass's job name;
        defaults to the input file's base name.
    """
    # Fix: the previous default `extra={ }` was a mutable default
    # argument, shared across every call of this ctor; use `None` and
    # substitute a fresh dict per call instead.
    if extra is None:
        extra = {}
    assert slice_size >= 0, \
        "Argument `slice_size` to ValueFunctionIterationPass.__init__" \
        " must be a non-negative integer."
    assert isinstance(extra, dict), \
        "Argument `extra` to ValueFunctionIterationPass.__init__" \
        " must be a dictionary instance."
    self.input_values = input_values_file
    self.output_values = None
    total_input_values = _count_input_values(input_values_file)
    if slice_size < 1:
        # trick to make the for-loop below work in the case of one
        # slice only
        slice_size = total_input_values
    # pad numbers with correct amount of zeros, so they look
    # sorted in plain `ls -l` output
    fmt = '%%0%dd' % (1 + int(math.log10(float(total_iterations))))
    self.jobname = ("%s.%s"
                    % ((parent or gc3libs.utils.basename_sans(input_values_file)),
                       (fmt % iteration)))
    # create data sub-directory for this pass's outputs
    datasubdir = os.path.join(datadir, self.jobname)
    if not os.path.exists(datasubdir):
        os.makedirs(datasubdir)
    # build list of tasks, one per slice of the input values
    tasks = []
    for start in range(0, total_input_values, slice_size):
        # create new job to handle this slice of values
        extra_args = extra.copy()
        extra_args['parent'] = self.jobname
        tasks.append(
            ValueFunctionIterationApplication(
                executable,
                input_values_file,
                iteration,
                total_iterations,
                # each task computes values with i in range
                # `start..end` (inclusive), and `end` is
                # generally `slice_size` elements after `start`
                start,
                end=min(start + slice_size - 1, total_input_values),
                output_dir=datasubdir,
                **extra_args
            )
        )
    # actually init jobs
    ParallelTaskCollection.__init__(self, self.jobname, tasks)