import os

from radical.entk import Task


def GenerateTask(tcfg, ecfg, pipe_name, stage_name, task_name):

    # Initialize a task object
    t = Task()

    # Define the magic variable dictionary
    mvar_dict = {"PIPELINE_ID": pipe_name}

    # Give this task object a name
    t.name = task_name

    # Pre-exec lets you load modules and set up the environment before
    # executing the workload
    if tcfg['pre_exec'] != "":
        t.pre_exec = [tcfg['pre_exec']]

    # Executable to use for the task
    t.executable = tcfg['executable']

    # If there's a user-defined input file (likely for genmod modules),
    # add it to the upload file list
    if "input_data_file" in tcfg['options']:
        tcfg['upload_input_data'].append(
            os.path.join(ecfg['exp_dir'], "input", ecfg['input_data_file']))

    # List of arguments for the executable
    t.arguments = [tcfg['script']] + match_options(tcfg['options'],
                                                   ecfg['options'])

    # CPU requirements for this task
    t.cpu_reqs = {
        'processes':           tcfg['cpu']['processes'],
        'process-type':        tcfg['cpu']['process-type'],
        'threads-per-process': tcfg['cpu']['threads-per-process'],
        'thread-type':         tcfg['cpu']['thread-type'],
    }

    # Upload data from the local machine to the remote machine
    # Note: the remote machine can be the local machine
    t.upload_input_data = tcfg['upload_input_data']

    # Copy data from other stages/tasks for use in this task
    copy_list = []
    if "copy_input_data" in tcfg:
        for copy_stage in tcfg['copy_input_data']:
            for copy_task in tcfg['copy_input_data'][copy_stage]:
                loc = "$Pipeline_{0}_Stage_{1}_Task_{2}".format(
                    pipe_name, copy_stage, copy_task)
                copy_list.extend([
                    '{0}/{1}'.format(loc, mvar_replace_dict(mvar_dict, x))
                    for x in tcfg['copy_input_data'][copy_stage][copy_task]])

    # Attach the copy list (if any) to the task object
    t.copy_input_data = copy_list

    # Set the download data for the task
    download_list = []
    outdir = os.path.join(ecfg['exp_dir'], "output")
    if "download_output_data" in tcfg:
        download_list.extend([
            '{0} > {1}/{0}'.format(mvar_replace_dict(mvar_dict, x), outdir)
            for x in tcfg['download_output_data']])

    # Attach the download list to this task
    t.download_output_data = download_list

    # Return the task object
    return t
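# GenerateTask() relies on two helpers, match_options() and
# mvar_replace_dict(), which are defined elsewhere in the module. The
# sketches below are hypothetical reconstructions of their expected behavior
# (option matching and "%MAGIC_VAR%" substitution), included only to make
# the snippet above self-contained -- they are not the original
# implementations.

def mvar_replace_dict(mvar_dict, string):
    # Substitute every magic variable token (assumed form: "%PIPELINE_ID%")
    # found in 'string' with its value from 'mvar_dict'
    for key, val in mvar_dict.items():
        string = string.replace('%{}%'.format(key), val)
    return string


def match_options(topts, eopts):
    # Pair each task option name with the value supplied in the experiment
    # options, producing a flat "--key value" argument list
    args = []
    for key in topts:
        if key in eopts:
            args.extend(['--{}'.format(key), str(eopts[key])])
    return args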
import radical.saga  as rs
import radical.utils as ru

from radical.entk import Pipeline, Stage, Task


def generate_pipeline(cfg):

    cfg_file = cfg['run_cfg_file']   # resource and workload config
    run_file = cfg['run_file']       # runs for this campaign

    # setup S1 workload
    cfg  = ru.Config(cfg=ru.read_json(cfg_file))
    runs = check_runs(cfg_file, run_file)

    if not runs:
        print('S1: nothing to run, exiting.')
        return

    # for each run in the campaign:
    #   - create a cfg with the requested receptor and smiles
    #   - create a number of masters as EnTK tasks and add them to a pipeline
    #   - submit the configured number of masters with that cfg

    # setup EnTK pipeline
    p = Pipeline()
    p.name = 'S1-RAPTOR'
    s = Stage()

    # create cfg
    subs = dict()
    rurl = cfg.fs_url + cfg.workload.results
    d    = rs.filesystem.Directory(rurl)
    ls   = [str(u).split('/')[-1] for u in d.list()]

    workload = cfg.workload

    for receptor, smiles, nodes, runtime in runs:

        print('%30s %s' % (receptor, smiles))
        name = '%s_-_%s' % (receptor, smiles)
        tgt  = '%s.%s.gz' % (name, workload.output)

        # rec = False
        # if tgt in ls:
        #     if workload.recompute:
        #         rec += 1
        #         d.move(tgt, tgt + '.bak')
        #     else:
        #         print('skip 1 %s' % name)
        #         continue
        #
        # if smiles in ls:
        #     if smiles not in subs:
        #         subs[smiles] = [str(u).split('/')[-1]
        #                         for u in d.list('%s/*' % smiles)]
        #     if tgt in subs[smiles]:
        #         if workload.recompute:
        #             rec += 2
        #             d.move('%s/%s'     % (smiles, tgt),
        #                    '%s/%s.bak' % (smiles, tgt))
        #         else:
        #             print('skip 2 %s' % name)
        #             continue
        #
        ## if os.path.exists('results/%s.%s.gz' % (name, workload.output)):
        ##     print('skip 3 %s' % name)
        ##     continue
        #
        # if rec: print('recompute %d %s' % (rec, name))
        # else  : print('compute 2 %s'    % name)

        cpn       = cfg.cpn
        gpn       = cfg.gpn
        n_masters = cfg.n_masters

        cfg.workload.receptor = receptor
        cfg.workload.smiles   = smiles
        cfg.workload.name     = name
        cfg.nodes             = nodes
        cfg.runtime           = runtime
        cfg.n_workers         = int(nodes / n_masters - 1)
        print('n_workers: %d' % cfg.n_workers)

        ru.write_json(cfg, 'configs/wf0.%s.cfg' % name)

        for i in range(n_masters):

            t = Task()
            t.pre_exec   = ['. /gpfs/alpine/scratch/mturilli1/med110/'
                            'radical.pilot.sandbox/s1.to/bin/activate']
            t.executable = 'python3'
            t.arguments  = ['wf0_master.py', i]

            # assumption: each master runs as a single process using all the
            # cores of a node (cpn)
            t.cpu_reqs = {'processes':           1,
                          'process-type':        None,
                          'threads-per-process': cpn,
                          'thread-type':         None}

            t.upload_input_data = ['wf0_master.py',
                                   'wf0_worker.py',
                                   'configs/wf0.%s.cfg > wf0.cfg' % name,
                                   'read_ligand_dict.py']
            t.link_input_data = ['%s > input_dir' % workload.input_dir]
            t.download_output_data = ['%s.%s.gz > results/%s.%s.gz'
                                      % (name, workload.output,
                                         name, workload.output)]

            # t.input_staging  = [{'source': 'wf0_master.py',
            #                      'target': 'wf0_master.py',
            #                      'action': rp.TRANSFER,
            #                      'flags':  rp.DEFAULT_FLAGS},
            #                     {'source': 'wf0_worker.py',
            #                      'target': 'wf0_worker.py',
            #                      'action': rp.TRANSFER,
            #                      'flags':  rp.DEFAULT_FLAGS},
            #                     {'source': 'configs/wf0.%s.cfg' % name,
            #                      'target': 'wf0.cfg',
            #                      'action': rp.TRANSFER,
            #                      'flags':  rp.DEFAULT_FLAGS},
            #                     {'source': workload.input_dir,
            #                      'target': 'input_dir',
            #                      'action': rp.LINK,
            #                      'flags':  rp.DEFAULT_FLAGS},
            #                     {'source': workload.impress_dir,
            #                      'target': 'impress_md',
            #                      'action': rp.LINK,
            #                      'flags':  rp.DEFAULT_FLAGS},
            #                     {'source': 'read_ligand_dict.py',
            #                      'target': 'read_ligand_dict.py',
            #                      'action': rp.TRANSFER,
            #                      'flags':  rp.DEFAULT_FLAGS}]
            # t.output_staging = [{'source': '%s.%s.gz'
            #                                % (name, workload.output),
            #                      'target': 'results/%s.%s.gz'
            #                                % (name, workload.output),
            #                      'action': rp.TRANSFER,
            #                      'flags':  rp.DEFAULT_FLAGS}]

            s.add_tasks(t)

    p.add_stages(s)

    return p
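# Illustrative driver (a sketch, not part of the original code): one way the
# pipeline built by generate_pipeline() could be submitted through an EnTK
# AppManager. The config paths and resource description values below are
# placeholders, and check_runs() is assumed to be defined elsewhere in the
# module.

if __name__ == '__main__':

    from radical.entk import AppManager

    # hypothetical campaign configuration paths
    cfg = {'run_cfg_file': 'configs/s1.cfg',
           'run_file':     'configs/runs.json'}

    p = generate_pipeline(cfg)
    if p:
        amgr = AppManager()
        amgr.resource_desc = {'resource': 'ornl.summit',   # placeholder values
                              'queue':    'batch',
                              'project':  'MED110',
                              'walltime': 60,
                              'cpus':     168}
        amgr.workflow = set([p])
        amgr.run()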