def example_multiple(self):
    workflow1 = self.example_simple()
    workflow2 = self.example_simple_exception1()
    workflow3 = self.example_simple_exception2()

    # merge jobs, dependencies and parameter links of the three workflows
    jobs = list(workflow1.jobs)
    jobs.extend(workflow2.jobs)
    jobs.extend(workflow3.jobs)

    dependencies = list(workflow1.dependencies)
    dependencies.extend(workflow2.dependencies)
    dependencies.extend(workflow3.dependencies)

    param_links = dict(workflow1.param_links)
    param_links.update(workflow2.param_links)
    param_links.update(workflow3.param_links)

    group1 = Group(name="simple example",
                   elements=workflow1.root_group)
    group2 = Group(name="simple with exception in Job1",
                   elements=workflow2.root_group)
    group3 = Group(name="simple with exception in Job3",
                   elements=workflow3.root_group)

    function_name = inspect.stack()[0][3]
    # pass the merged parameter links so they are not silently dropped
    workflow = Workflow(jobs, dependencies,
                        root_group=[group1, group2, group3],
                        name=function_name,
                        param_links=param_links)
    return workflow
def example_dynamic_outputs(self):
    # jobs
    job1 = self.job1_with_outputs1()
    job2 = self.job2_with_outputs1()
    job3 = self.job3()
    job4 = self.job4()

    # building the workflow
    jobs = [job1, job2, job3, job4]
    dependencies = [(job1, job2),
                    (job1, job3),
                    (job2, job4),
                    (job3, job4)]

    group_1 = Group(name='group_1', elements=[job2, job3])
    group_2 = Group(name='group_2', elements=[job1, group_1])

    # parameter links: {dest_job: {dest_param: [(src_job, src_param)]}}
    links = {
        job2: {'filePathIn1': [(job1, 'filePathOut1')]},
        job3: {'filePathIn': [(job1, 'filePathOut2')]},
        job4: {'file1': [(job2, 'filePathOut')]},
    }

    function_name = inspect.stack()[0][3]
    workflow = Workflow(jobs, dependencies,
                        root_group=[group_2, job4],
                        name=function_name,
                        param_links=links)
    return workflow
def example_fake_pipelineT1(self, n_iter=100):
    jobs = []
    dependencies = []
    root_group = []
    for i in range(0, n_iter):
        job1 = self.job_sleep(2)
        job1.name = "Brain extraction"
        jobs.append(job1)

        job11 = self.job_sleep(1)
        job11.name = "test 1"
        jobs.append(job11)
        job12 = self.job_sleep(1)
        job12.name = "test 2"
        jobs.append(job12)
        job13 = self.job_sleep(1)
        job13.name = "test 3"
        jobs.append(job13)

        job2 = self.job_sleep(2)
        job2.name = "Gray/white segmentation"
        jobs.append(job2)
        job3 = self.job_sleep(2)
        job3.name = "Left hemisphere sulci recognition"
        jobs.append(job3)
        job4 = self.job_sleep(2)
        job4.name = "Right hemisphere sulci recognition"
        jobs.append(job4)

        # dependencies.append((job1, job2))
        dependencies.append((job1, job11))
        dependencies.append((job11, job12))
        dependencies.append((job12, job13))
        dependencies.append((job13, job2))
        dependencies.append((job2, job3))
        dependencies.append((job2, job4))

        group_sulci = Group(name="Sulci recognition",
                            elements=[job3, job4])
        group_subject = Group(
            name="sulci recognition -- subject " + repr(i),
            elements=[job1, job11, job12, job13, job2, group_sulci])

        root_group.append(group_subject)

    function_name = inspect.stack()[0][3]
    workflow = Workflow(jobs, dependencies, root_group,
                        name=function_name)
    return workflow
def _create_workflow(self, subject_ids):
    study_config = self._study
    workflow = Workflow(
        name='Morphologist UI - %s' % study_config.study_name,
        jobs=[])
    workflow.root_group = []
    initial_vol_format = study_config.volumes_format

    priority = (len(subject_ids) - 1) * 100
    for subject_id in subject_ids:
        analysis = self._study.analyses[subject_id]
        subject = self._study.subjects[subject_id]
        analysis.set_parameters(subject)
        # analysis.propagate_parameters()
        pipeline = analysis.pipeline
        pipeline.enable_all_pipeline_steps()
        # force highest priority normalization method
        # FIXME: specific knowledge of Morphologist should not be used here.
        pipeline.Normalization_select_Normalization_pipeline \
            = 'NormalizeSPM'
        pipeline_tools.disable_runtime_steps_with_existing_outputs(
            pipeline)

        missing = pipeline_tools.nodes_with_missing_inputs(pipeline)
        if missing:
            self.check_missing_models(pipeline, missing)
            print('MISSING INPUTS IN NODES:', missing)
            raise MissingInputFileError("subject: %s" % subject_id)

        wf = pipeline_workflow.workflow_from_pipeline(
            pipeline, study_config=study_config, jobs_priority=priority)
        njobs = len([j for j in wf.jobs if isinstance(j, Job)])
        if njobs != 0:
            priority -= 100
        workflow.jobs += wf.jobs
        workflow.dependencies += wf.dependencies
        group = Group(wf.root_group,
                      name='Morphologist %s' % str(subject))
        group.user_storage = subject_id
        workflow.root_group.append(group)  # += wf.root_group
        workflow.groups += [group] + wf.groups
    return workflow
def example_simple_exception2(self):
    # jobs
    job1 = self.job1()
    job2 = self.job2()
    job4 = self.job4()
    job3 = self.job3_exception()

    jobs = [job1, job2, job3, job4]
    dependencies = [(job1, job2),
                    (job1, job3),
                    (job2, job4),
                    (job3, job4)]

    group_1 = Group(name='group_1', elements=[job2, job3])
    group_2 = Group(name='group_2', elements=[job1, group_1])

    function_name = inspect.stack()[0][3]
    workflow = Workflow(jobs, dependencies,
                        root_group=[group_2, job4],
                        name=function_name)
    return workflow
def process_group(group, to_remove, name):
    new_group = []
    for element in group:
        if isinstance(element, Job):
            if element in to_remove:
                if to_remove[element] not in new_group:
                    new_group.append(to_remove[element])
            else:
                new_group.append(element)
        else:
            new_group.append(
                process_group(element.elements, to_remove, element.name))
    return Group(new_group, name)
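# Usage sketch (assumption: `workflow`, `old_job` and `new_job` are provided
# by the caller; this helper is not part of the original code). It shows how
# process_group() above can substitute one job for another while keeping the
# existing group hierarchy: every Job that is a key of `to_remove` is replaced
# by its mapped value, added only once per group.
def example_replace_job(workflow, old_job, new_job):
    to_remove = {old_job: new_job}
    new_root = process_group(workflow.root_group, to_remove, 'root')
    return new_root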
def example_n_jobs_with_dependencies(self, nb=500, time=60):
    dependencies = []
    jobs = []

    intermed_job1 = self.job_sleep(2)
    jobs.append(intermed_job1)
    intermed_job2 = self.job_sleep(2)
    jobs.append(intermed_job2)

    elem_group1 = []
    for i in range(0, nb):
        job = self.job_sleep(time)
        jobs.append(job)
        elem_group1.append(job)
        dependencies.append((job, intermed_job1))
    group1 = Group(name="Group 1", elements=elem_group1)

    elem_group2 = []
    for i in range(0, nb):
        job = self.job_sleep(time)
        jobs.append(job)
        elem_group2.append(job)
        dependencies.append((intermed_job1, job))
        dependencies.append((job, intermed_job2))
    group2 = Group(name="Group 2", elements=elem_group2)

    elem_group3 = []
    for i in range(0, nb):
        job = self.job_sleep(time)
        jobs.append(job)
        elem_group3.append(job)
        dependencies.append((intermed_job2, job))
    group3 = Group(name="Group 3", elements=elem_group3)

    root_group = [group1, intermed_job1, group2, intermed_job2, group3]

    function_name = inspect.stack()[0][3]
    workflow = Workflow(jobs, dependencies, root_group,
                        name=function_name)
    return workflow
def example_dynamic_outputs_with_mapreduce_jobs(self):
    # small map/reduce using MapJob / ReduceJob
    # jobs
    job1 = self.job_list_with_outputs()
    job2_0 = self.job8_with_output()
    job2_0.name = 'job2_0'
    job2_1 = self.job8_with_output()
    job2_1.name = 'job2_1'
    job3 = self.job_reduce_cat()
    map_job = MapJob(referenced_input_files=job1.referenced_output_files,
                     name='map')
    reduce_job = ReduceJob()

    # building the workflow
    jobs = [job1, job2_0, job2_1, job3, map_job, reduce_job]
    dependencies = []

    group_1 = Group(name='group_1', elements=[job2_0, job2_1])

    links = {
        map_job: {'inputs': [(job1, 'outputs')]},
        job2_0: {'input': [(map_job, 'output_0')]},
        job2_1: {'input': [(map_job, 'output_1')]},
        reduce_job: {'input_0': [(job2_0, 'output')],
                     'input_1': [(job2_1, 'output')],
                     'lengths': [(map_job, 'lengths')]},
        job3: {'inputs': [(reduce_job, 'outputs')]},
    }

    function_name = inspect.stack()[0][3]
    workflow = Workflow(
        jobs, dependencies,
        root_group=[job1, map_job, group_1, reduce_job, job3],
        name=function_name,
        param_links=links)
    return workflow
def example_dynamic_outputs_with_mapreduce(self):
    # small map/reduce
    # jobs
    job1 = self.job_list_with_outputs()
    job2_0 = self.job8_with_output()
    job2_0.name = 'job2_0'
    job2_1 = self.job8_with_output()
    job2_1.name = 'job2_1'
    job3 = self.job_reduce_cat()

    # building the workflow
    jobs = [job1, job2_0, job2_1, job3]
    dependencies = []

    group_1 = Group(name='group_1', elements=[job2_0, job2_1])

    links = {
        job2_0: {'input': [(job1, 'outputs', ('list_to_sequence', 0))]},
        job2_1: {'input': [(job1, 'outputs', ('list_to_sequence', 1))]},
        job3: {'inputs': [(job2_0, 'output', ('sequence_to_list', 0)),
                          (job2_1, 'output', ('sequence_to_list', 1))]},
    }

    function_name = inspect.stack()[0][3]
    workflow = Workflow(jobs, dependencies,
                        root_group=[job1, group_1, job3],
                        name=function_name,
                        param_links=links)
    return workflow
def export(self, script_path):
    try:
        from soma_workflow.client import Job
        from soma_workflow.client import Group
        from soma_workflow.client import Workflow
        from soma_workflow.client import SharedResourcePath
        from soma_workflow.client import Helper
    except ImportError:
        errmsg = "soma-workflow was not found. "\
                 "Please verify your soma-workflow installation "\
                 "on your computer (e.g. PYTHONPATH).\n"
        sys.stderr.write(errmsg)
        sys.stdout.write(errmsg)
        raise NoSomaWFError

    # dataset on remote machine
    dataset_dir = SharedResourcePath(
        relative_path=self.dataset_relative_path,
        namespace=self.namespace,
        uuid=self.uuid)
    # tree on remote machine
    epac_tree_dir = SharedResourcePath(
        relative_path=self.tree_relative_path,
        namespace=self.namespace,
        uuid=self.uuid)
    # reduce output on remote machine
    out_dir = SharedResourcePath(relative_path=self.output_relative_path,
                                 namespace=self.namespace,
                                 uuid=self.uuid)
    # workflow file for soma-workflow
    soma_workflow_file = script_path

    # collect all key job files
    job_paths = []
    for root, _, files in os.walk(
            os.path.join(self.root, self.jobs_relative_path)):
        for f in files:
            _, ext = os.path.splitext(f)
            if ext == ".job":
                job_paths.append(f)

    # building the mapper tasks
    dependencies = []
    map_jobs = []
    for i in range(len(job_paths)):
        job_relative_path = os.path.join(self.jobs_relative_path,
                                         job_paths[i])
        key_path = SharedResourcePath(relative_path=job_relative_path,
                                      namespace=self.namespace,
                                      uuid=self.uuid)
        map_cmd = ["epac_mapper",
                   "--datasets", dataset_dir,
                   "--keysfile", key_path,
                   "--treedir", epac_tree_dir]
        map_job = Job(command=map_cmd,
                      name="map_step",
                      referenced_input_files=[],
                      referenced_output_files=[])
        map_jobs.append(map_job)
    group_map_jobs = Group(elements=map_jobs, name="all map jobs")

    # building the reduce step
    reduce_cmd = ["epac_reducer",
                  "--treedir", epac_tree_dir,
                  "--outdir", out_dir]
    reduce_job = Job(command=reduce_cmd,
                     name="reduce_step",
                     referenced_input_files=[],
                     referenced_output_files=[])
    for map_job in map_jobs:
        dependencies.append((map_job, reduce_job))
    jobs = map_jobs + [reduce_job]

    # build the workflow and save it to disk
    workflow = Workflow(jobs=jobs,
                        dependencies=dependencies,
                        root_group=[group_map_jobs, reduce_job])
    Helper.serialize(soma_workflow_file, workflow)
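# Submission sketch (assumptions: a computing resource named by `resource_id`
# is configured for soma-workflow's client/server mode, and `login`/`password`
# are valid for it; this helper is not part of the original export() code).
# It reloads the serialized workflow file and submits it with the same client
# API used in the examples further below.
def submit_exported_workflow(soma_workflow_file, resource_id, login, password):
    from soma_workflow.client import Helper, WorkflowController
    workflow = Helper.unserialize(soma_workflow_file)
    controller = WorkflowController(resource_id, login, password)
    return controller.submit_workflow(workflow=workflow,
                                      name="EPAC map/reduce")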
def export(self, workflow_dir, num_processes):
    '''
    Parameters
    ----------
    workflow_dir: string
        the directory where the workflow is exported
    num_processes: integer
        the number of processes you want to run
    '''
    try:
        from soma_workflow.client import Job
        from soma_workflow.client import Group
        from soma_workflow.client import Workflow
        from soma_workflow.client import Helper
    except ImportError:
        errmsg = "soma-workflow was not found. "\
                 "Please verify your soma-workflow installation "\
                 "on your computer (e.g. PYTHONPATH).\n"
        sys.stderr.write(errmsg)
        sys.stdout.write(errmsg)
        raise NoSomaWFError

    self.workflow_dir = workflow_dir
    soma_workflow_file = os.path.join(self.workflow_dir, "soma_workflow")
    if not os.path.exists(self.workflow_dir):
        os.makedirs(self.workflow_dir)
    tree_root = load_tree(self.epac_tree_dir_path)
    keysfile_list = export_jobs(tree_root, num_processes, workflow_dir)

    # building the mapper tasks
    dependencies = []
    map_jobs = []
    for i in range(len(keysfile_list)):
        key_path = os.path.join(workflow_dir, keysfile_list[i])
        map_cmd = ["epac_mapper",
                   "--datasets", self.dataset_dir_path,
                   "--keysfile", key_path,
                   "--treedir", self.epac_tree_dir_path]
        map_job = Job(command=map_cmd,
                      name="map_step",
                      referenced_input_files=[],
                      referenced_output_files=[])
        map_jobs.append(map_job)
    group_map_jobs = Group(elements=map_jobs, name="all map jobs")

    # building the reduce task
    reduce_cmd = ["epac_reducer",
                  "--treedir", self.epac_tree_dir_path,
                  "--outdir", self.out_dir_path]
    reduce_job = Job(command=reduce_cmd,
                     name="reduce_step",
                     referenced_input_files=[],
                     referenced_output_files=[])
    for map_job in map_jobs:
        dependencies.append((map_job, reduce_job))
    jobs = map_jobs + [reduce_job]

    # build the workflow and save it to disk
    workflow = Workflow(jobs=jobs,
                        dependencies=dependencies,
                        root_group=[group_map_jobs, reduce_job])
    Helper.serialize(soma_workflow_file, workflow)
    group_score += jobs_score
    group_significativity += jobs_perm
    group_merge.append(job_merge)

    if count != 1:
        relationships = zip(base_group, jobs_score)
        for relation in relationships:
            dependencies.append(relation)
    dependencies.append((job_merge, job_final))
    count -= 1

jobs += group_score + group_significativity + group_merge
jobs.append(job_final)

scores = Group(elements=group_score,
               name="group where test scores are calculated")
significativity = Group(
    elements=group_significativity,
    name="group where distributions are calculated for significance")
merge = Group(elements=group_merge, name="group where we merge results")

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies,
                    root_group=[scores, significativity, merge, job_final])

Helper.serialize(
    os.path.join(inputs_path, 'optimized_cluster_part_2.somawf'), workflow)

### Submit the workflow to computing resource (configured in the client-server mode)
def prepare_treatment_jobs(treatment, tmp_local_dir, local_result_path,
                           local_user, local_host, remote_host,
                           remote_user, remote_path, label_for_cluster):
    """Prepare soma-workflow jobs to perform one treatment (i.e., one subject).

    Parameters
    ----------
    treatment : FMRITreatment
        the treatment defining the analysis
    tmp_local_dir : str
        a path where to store the temporary config file before sending it
        to the remote host
    local_result_path : str
        path where to store the final result
    local_user : str
        the user on the local host who enables SSH connection from the
        remote cluster
    local_host : str
        local host (used to send back the result)
    remote_host : str
        remote machine where the treatment will be run
    remote_user : str
        user login on the remote machine
    remote_path : str
        path on the remote machine where to store ROI data and analysis
        results
    label_for_cluster : str
        label prefix used to name jobs in soma-workflow

    Returns
    -------
    a tuple (job_split, jobs, dependencies, mainGroup)
        job_split (Job)
            job handling the splitting of input data into ROI data
        jobs (list of Job)
            all jobs except the splitting job -> ROI analyses, result merge,
            scp of the result back to the local host, data cleaning
        dependencies (list of job pairs)
            define the pipeline structure
        mainGroup (Group)
            top-level object gathering all jobs for this treatment
    """
    # roiFiles contains the list of files that will be produced by job_split
    roiFiles, roiIds = treatment.dump_roi_datasets(dry=True)
    logger.info('Get list of split data files ... %d files', len(roiFiles))

    datafiles = treatment.get_data_files()

    # Make all paths relative in the treatment config file
    # so that data files can be found on the cluster file system
    treatment.replace_data_dir('./')
    remote_cfg_file = op.join(tmp_local_dir, './detectestim_remote.xml')
    treatment.set_init_param('make_outputs', False)
    logger.info('Save remote treatment to %s', remote_cfg_file)
    save_treatment(treatment, remote_cfg_file)

    logger.info('Upload input data')
    # All data which are the inputs of the workflow:
    data_to_upload = datafiles + [remote_cfg_file]
    remote_input_files = remote_copy(data_to_upload, remote_host,
                                     remote_user, remote_path)
    logger.info('Remove tmp remote cfg file')
    os.remove(remote_cfg_file)

    logger.info('Prepare jobs ...')
    logger.info('Job split ...')
    verbose_level = logger.getEffectiveLevel()
    cmd = ["pyhrf_split_roidata", "-c", basename(remote_cfg_file),
           "-v %d" % verbose_level, "-d", "./"]
    logger.info('-> %s', cmd)
    job_split = Job(cmd, working_directory=remote_path, name="roi_split")

    logger.info('Jobs JDE ...')
    jobs_jde = [Job(["pyhrf_jde_estim", "-c", basename(remote_cfg_file),
                     "-r", basename(roiFile), "-v %d" % verbose_level],
                    working_directory=remote_path,
                    name="jde_r%04d" % roiId)
                for roiFile, roiId in zip(roiFiles, roiIds)]
    logger.info('First jde job -> %s', jobs_jde[0].command)

    # Files produced by all JDE jobs, which will then be used as input of the
    # merge job:
    resultFiles = ["result_%04d.pck" % iroi for iroi in roiIds]

    logger.info('Job pack result ...')
    # Output of the merge job, which has to be transferred back to local:
    remote_resultFile = './result.pck'
    logger.info('Remote result file: %s', remote_resultFile)
    cmd = ["pyhrf_pack_results", '-v1', '-o', remote_resultFile] + resultFiles
    logger.info('cmd pack result: %s', cmd)
    job_merge = Job(cmd, working_directory=remote_path, name="merge_results")

    # Retrieve result file:
    # local_host = "132.166.200.5"  # HACK
    # cmd = ["pyhrf_shell_cmd", "scp", "-C", remote_resultFile,
    #        "%s@%s:\"%s\"" % (local_user, local_host, local_result_path)]
    cmd = ["scp", "-C", remote_resultFile,
           "%s@%s:\"%s\"" % (local_user, local_host, local_result_path)]
    logger.info('cmd scp result: %s', cmd)
    job_scp_result = Job(cmd, working_directory=remote_path,
                         name="scp_result")

    # Clean everything:
    # -> all input files, split ROI data, result for each ROI, merged result:
    cmd = ["rm", "-f", remote_resultFile] + \
        list(map(basename, roiFiles)) + resultFiles + remote_input_files
    logger.info('cmd clean: %s', cmd)
    job_clean = Job(cmd, working_directory=remote_path, name="clean_files")

    logger.info('Setup of work flow ...')

    # Build the job list, dependencies and groups
    clean = True
    if clean:
        nodes = [job_merge, job_scp_result, job_clean] + jobs_jde
    else:
        nodes = [job_merge, job_scp_result] + jobs_jde

    dependencies = []
    for jj in jobs_jde:
        dependencies.append((job_split, jj))
        dependencies.append((jj, job_merge))
    dependencies.append((job_merge, job_scp_result))
    if clean:
        dependencies.append((job_scp_result, job_clean))

    jjGroup = Group(elements=jobs_jde, name=label_for_cluster + '-roi_jobs')
    if clean:
        elements = [job_split, jjGroup, job_merge, job_scp_result, job_clean]
    else:
        elements = [job_split, jjGroup, job_merge, job_scp_result]
    mainGroup = Group(name=label_for_cluster, elements=elements)

    return job_split, nodes, dependencies, mainGroup
    dependencies.append((job, job_merge))
jobs.append(job_merge)

# Plotting the maps
job_final = Job(command=["python", "create_maps.py",
                         "--input", derivatives_path,
                         "--parameters", parameters_path,
                         "--subject", args.subject,
                         "--fmri_data", fmri_path],
                name="Creating the maps.",
                working_directory=scripts_path)
jobs.append(job_final)
dependencies.append((job_merge, job_final))

cv_alphas = Group(elements=group_cv_alphas, name="CV on alphas")
significativity = Group(elements=group_significativity,
                        name="Fit of the models with best alphas")

workflow = Workflow(
    jobs=jobs,
    dependencies=dependencies,
    root_group=[job_0, cv_alphas, significativity, job_merge, job_final])

Helper.serialize(os.path.join(inputs_path, 'cluster_jobs.somawf'), workflow)

### Submit the workflow to computing resource (configured in the client-server mode)
controller = WorkflowController("DSV_cluster_{}".format(login), login,
                                password)
            str(cv_index)],
            name="Alphas CV - split {} - {}".format(run, cv_index),
            working_directory=scripts_path,
            native_specification="-q Nspin_bigM")
        cv_index += 1
        group_cv_alphas.append(job)
        jobs.append(job)
        dependencies.append((job_0, job))
        dependencies.append((job, job_merge_cv))
    group_cv_merge.append(job_merge_cv)
    jobs_tmp.append(job_merge_cv)

jobs += jobs_tmp

cv_alphas = Group(elements=group_cv_alphas, name="CV on alphas")
cv_merge = Group(elements=group_cv_merge, name="merge CV results")

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies,
                    root_group=[job_0, cv_alphas, cv_merge])

Helper.serialize(
    os.path.join(inputs_path, 'optimized_cluster_part_1.somawf'), workflow)

### Submit the workflow to computing resource (configured in the client-server mode)
controller = WorkflowController("DSV_cluster_{}".format(login), login,
                                password)  # "DSV_cluster_ap259944", login, password
            dependencies.append((fit, transform))
            inner_fold_jobs.append(transform)  # Just for grouping

            # Predict task
            predict_cmd = predict_command(param_transform_files,
                                          param_prediction_file)
            job_name = common_job_name + "/predict"
            predict = Job(command=predict_cmd, name=job_name)
            jobs.append(predict)
            dependencies.append((transform, predict))
            inner_fold_jobs.append(predict)  # Just for grouping

            # Set dependencies of cleaning job
            dependencies.append((predict, clean_job))
        # End loop on params

        # Group all jobs of this fold in a group
        group_elements.append(
            Group(elements=inner_fold_jobs,
                  name="Outer fold {out}/Inner fold {inn}".format(
                      out=outer_fold_index, inn=inner_fold_index)))
    # End inner loop
# End outer loop

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies,
                    root_group=group_elements,
                    name=WF_NAME)

# save the workflow into a file
Helper.serialize(os.path.join(OUT_DIR, WF_NAME), workflow)
group_elements = []

first_job = Job(command=["sleep", "10"], name="first job")
last_job = Job(command=["sleep", "10"], name="last job")
jobs.append(first_job)
jobs.append(last_job)

for i in range(0, 30):
    job = Job(command=["sleep", "60"], name="job " + repr(i))
    jobs.append(job)
    dependencies.append((first_job, job))
    dependencies.append((job, last_job))
    group_elements.append(job)

thirty_jobs_group = Group(elements=group_elements, name="my 30 jobs")

workflow = Workflow(jobs=jobs,
                    dependencies=dependencies,
                    root_group=[first_job, thirty_jobs_group, last_job])

login = '******'
password = '******'
controller = WorkflowController("DSV_cluster", login, password)

controller.submit_workflow(workflow=workflow,
                           name="Simple workflow with group")
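# Follow-up sketch (assumptions: the controller is connected to a configured
# resource, and Helper.wait_workflow / Helper.list_failed_jobs from
# soma_workflow.client are available; this helper is not part of the snippet
# above). submit_workflow() returns a workflow id that can be used to block
# until completion and then report failed jobs.
def wait_and_report(controller, workflow):
    from soma_workflow.client import Helper
    workflow_id = controller.submit_workflow(
        workflow=workflow, name="Simple workflow with group")
    Helper.wait_workflow(workflow_id, controller)
    return Helper.list_failed_jobs(workflow_id, controller)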
    run = int(info[1])
    alpha = int(info[3])
    job = Job(command=["python", "significance_clusterized.py",
                       "--yaml_file", yaml_file,
                       "--output_r2", os.path.join(args.output, 'r2'),
                       "--output_distribution",
                       os.path.join(args.output, 'distribution'),
                       "--x", args.x,
                       "--y", args.y,
                       "--shuffling", shuffling,
                       "--n_permutations", args.nb_permutations,
                       "--alpha_percentile", args.alpha_percentile],
              name="job {} - alpha {}".format(run, alpha),
              working_directory=scripts_path)
    group_significativity.append(job)
    jobs.append(job)

distribution_voxels = Group(elements=group_significativity,
                            name="Voxel wise fitting of the models")

workflow2 = Workflow(jobs=jobs, root_group=[distribution_voxels])

### Submit the workflow to computing resource (configured in the client-server mode)
controller2 = WorkflowController("DSV_cluster_ap259944", args.login,
                                 args.password)

workflow_id2 = controller2.submit_workflow(workflow=workflow2,
                                           name="Voxel-wise computations")

# You may use the gui or manually transfer the files:
manual = True
if manual: