def generateDax(name="mvm", inputData=None): """Generate a Pegasus DAX abstract workflow""" dax = peg.ADAG(name) taskname = "matchedVisitMetrics" arguments = " --doraise --config instrumentName='HSC' datasetName='HSC-PDR2' " \ "doApplyExternalPhotoCalib=True doApplyExternalSkyWcs=True externalPhotoCalibName=fgcm " with open(inputData, 'r') as f: for line in f: filt, tract, visits = line.strip().split(' ') outNonRepoPath = os.path.join(outPath, tract, filt) logger.debug("add job of dataId: %s %s %s to %s", filt, tract, visits, outNonRepoPath) task = peg.Job(name=taskname) task.addArguments( inputRepo, "--output", outNonRepoPath, arguments, "--id ccd=0..8^10..103 tract=%s visit=%s" % (tract, visits)) dax.addJob(task) logfile = peg.File("%s-%s-%s.log" % (taskname, tract, filt)) dax.addFile(logfile) task.setStdout(logfile) task.setStderr(logfile) task.uses(logfile, link=peg.Link.OUTPUT) return dax
def __init__(self, executable):
    self.in_workflow = False
    self.executable = executable
    self._inputs = []
    self._outputs = []
    self._dax_node = dax.Job(name=executable.logical_name,
                             version=executable.version,
                             namespace=executable.namespace)
    self._args = []
    self._options = []
def make(self, task_name, dataId=None, options=None, repo=None):
    job = DAX3.Job(task_name)
    if repo is None:
        repo = self.repo
    args = [repo]
    args = self._add_dataId(args, dataId)
    args = self._add_options(args, options)
    configfile = os.path.join(self.config_dir, '%s-config.py' % task_name)
    args.extend(['--configfile', configfile])
    job.addArguments(*args)
    self.dax.addJob(job)
    if self.bin_dir is not None and self.tc is not None:
        self._update_tc_file(task_name)
    return job
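# --- Illustrative sketch (not from the original source) ---------------------
# The helpers _add_dataId() and _add_options() called above are not shown in
# this excerpt. Standalone, hypothetical equivalents, based on the
# "--id key=value" and "--option value" command-line style used by the other
# jobs in these scripts:
def _add_dataId(args, dataId):
    # e.g. {'visit': 123, 'raft': '2,2'} -> ['--id', 'visit=123', 'raft=2,2']
    if dataId:
        args.append('--id')
        args.extend('%s=%s' % (key, value) for key, value in dataId.items())
    return args

def _add_options(args, options):
    # e.g. {'--output': 'output_repo'} -> ['--output', 'output_repo']
    if options:
        for option, value in options.items():
            args.extend([option, str(value)])
    return args

print(_add_options(_add_dataId(['repo'], dict(visit=123, raft='2,2')),
                   {'--output': 'output_repo'}))
# ['repo', '--id', 'visit=123', 'raft=2,2', '--output', 'output_repo']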
def __init__(self, executable):
    self.in_workflow = False
    self.executable = executable
    self._inputs = []
    self._outputs = []
    self._dax_node = dax.Job(name=executable.logical_name,
                             version=executable.version,
                             namespace=executable.namespace)
    self._args = []
    # Each value in _options is added separated with whitespace
    # so ['--option','value'] --> "--option value"
    self._options = []
    # For _raw_options *NO* whitespace is added.
    # so ['--option','value'] --> "--optionvalue"
    # and ['--option',' ','value'] --> "--option value"
    self._raw_options = []
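# --- Illustrative sketch (not from the original source) ---------------------
# The method that assembles the final command line is not shown in this
# excerpt; the comments above only describe the intended joining rule. A
# minimal, standalone illustration of that rule, assuming nothing about the
# real class:
def _join_arguments(options, raw_options):
    # _options entries are separated by whitespace ...
    joined = " ".join(options)            # ['--option', 'value'] -> "--option value"
    # ... while _raw_options entries are concatenated verbatim
    joined += " " + "".join(raw_options)  # ['--option', ' ', 'value'] -> "--option value"
    return joined.strip()

print(_join_arguments(['--option', 'value'], ['--frames', '0', '-1024']))
# --option value --frames0-1024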
def main():
    args = parse_args()
    setup_logger(args.debug)

    # TODO: handle exceptions for bad file paths
    workflow_file_path = args.cwl_workflow_file_path
    workflow_file_dir = os.path.dirname(workflow_file_path)

    log.info("Loading {}".format(workflow_file_path))
    workflow = cwl.load_document(workflow_file_path)

    adag = dax.ADAG("dag-generated-from-cwl", auto=True)
    rc = ReplicaCatalog()
    tc = TransformationCatalog(workflow_file_dir)

    # process initial input file(s)
    # TODO: need to account for the different fields for a file class
    # TODO: log warning for the fields that we are skipping
    workflow_input_strings = dict()
    workflow_files = dict()

    log.info("Collecting inputs in {}".format(args.input_file_spec_path))
    with open(args.input_file_spec_path, "r") as yaml_file:
        input_file_specs = load(yaml_file, Loader=Loader)

        for input in workflow.inputs:
            input_type = input.type
            if input_type == "File":
                workflow_files[get_basename(input.id)] = get_basename(input.id)
                # TODO: account for non-local sites
                rc.add_item(get_basename(input.id),
                            input_file_specs[get_basename(input.id)]["path"],
                            "local")
            elif input_type == "string":
                workflow_input_strings[get_basename(input.id)] = \
                    input_file_specs[get_basename(input.id)]
            elif isinstance(input_type, cwl.InputArraySchema):
                if input_type.items == "File":
                    # TODO: account for workflow inputs of type File[]
                    pass
                elif input_type.items == "string":
                    workflow_input_strings[get_basename(input.id)] = \
                        input_file_specs[get_basename(input.id)]

    log.info("Collecting output files")
    for step in workflow.steps:
        cwl_command_line_tool = cwl.load_document(step.run) if isinstance(step.run, str) \
            else step.run

        for output in cwl_command_line_tool.outputs:
            # TODO: account for outputs that are not files
            output_name = get_name(step.id, output.id)

            log.debug("Adding (key: {0}, value: {1}) to workflow_files".format(
                output_name, output.outputBinding.glob))

            # TODO: throw error when glob contains javascript expression
            # or pattern as we cannot support anything that is dynamic
            workflow_files[output_name] = output.outputBinding.glob

    log.info("Building workflow steps into dax jobs")
    for step in workflow.steps:
        # convert cwl:CommandLineTool -> pegasus:Executable
        cwl_command_line_tool = cwl.load_document(step.run) if isinstance(step.run, str) \
            else step.run

        executable_name = os.path.basename(cwl_command_line_tool.baseCommand) if \
            os.path.isabs(cwl_command_line_tool.baseCommand) else cwl_command_line_tool.baseCommand

        dax_executable = dax.Executable(executable_name)

        # add executable to transformation catalog
        tc.add_item(executable_name, cwl_command_line_tool.baseCommand)

        # create job with executable
        dax_job = dax.Job(dax_executable)

        step_inputs = dict()
        for input in step.in_:
            input_id = get_basename(input.id)
            if isinstance(input.source, str):
                step_inputs[input_id] = get_basename(input.source)
            elif isinstance(input.source, list):
                step_inputs[input_id] = [
                    get_basename(file) for file in input.source
                ]

        # add input uses to job
        for input in cwl_command_line_tool.inputs:
            if input.type == "File":
                file_id = step_inputs[get_name(step.id, input.id)]
                file = dax.File(workflow_files[file_id])

                log.debug("Adding link ({0} -> {1})".format(
                    file_id, dax_job.name))

                dax_job.uses(file, link=dax.Link.INPUT)

            # TODO: better type checking for string[] and File[] ?
            elif isinstance(input.type, cwl.CommandInputArraySchema):
                if input.type.items == "File":
                    file_ids = step_inputs[get_name(step.id, input.id)]
                    for file_id in file_ids:
                        file = dax.File(workflow_files[file_id])

                        log.debug("Adding link ({0} -> {1})".format(
                            file_id, dax_job.name))

                        dax_job.uses(file, link=dax.Link.INPUT)

        # add output uses to job
        # TODO: ensure that these are of type File or File[]
        for output in step.out:
            file_id = get_basename(output)
            file = dax.File(workflow_files[file_id])

            log.debug("Adding link ({0} -> {1})".format(dax_job.name, file_id))

            dax_job.uses(file, link=dax.Link.OUTPUT, transfer=True, register=True)

        # add arguments to job
        # TODO: place argument building up in a function
        dax_job_args = cwl_command_line_tool.arguments if \
            cwl_command_line_tool.arguments is not None else []

        # process cwl inputBindings if they exist and build up job argument list
        cwl_command_line_tool_inputs = sorted(
            cwl_command_line_tool.inputs,
            key=lambda input: input.inputBinding.position
            if input.inputBinding.position is not None else 0)

        for input in cwl_command_line_tool_inputs:
            # process args
            if input.inputBinding is not None:
                # TODO: account for inputBinding separation
                if input.inputBinding.prefix is not None:
                    dax_job_args.append(input.inputBinding.prefix)

                if input.type == "File":
                    dax_job_args.append(
                        dax.File(workflow_files[step_inputs[get_name(
                            step.id, input.id)]]))

                if input.type == "string":
                    dax_job_args.append(
                        workflow_input_strings[step_inputs[get_name(
                            step.id, input.id)]])

                # handle array type inputs
                if isinstance(input.type, cwl.CommandInputArraySchema):
                    if input.type.items == "File":
                        for file in step_inputs[get_name(step.id, input.id)]:
                            dax_job_args.append(dax.File(workflow_files[file]))
                    elif input.type.items == "string":
                        input_string_arr_id = step_inputs[get_name(
                            step.id, input.id)]

                        separator = " " if input.inputBinding.itemSeparator is None \
                            else input.inputBinding.itemSeparator

                        dax_job_args.append(
                            # TODO: currently only accounting for input strings that
                            # are inputs to the entire workflow
                            separator.join(
                                workflow_input_strings[input_string_arr_id]))

        log.debug("Adding job: {0}, with args: {1}".format(
            dax_job.name, dax_job_args))
        dax_job.addArguments(*dax_job_args)

        # add job to DAG
        adag.addJob(dax_job)

    rc.write_catalog("rc.txt")
    tc.write_catalog("tc.txt")

    with open(args.output_file_path, "w") as f:
        log.info("Writing DAX to {}".format(args.output_file_path))
        adag.writeXML(f)
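# --- Illustrative sketch (not from the original source) ---------------------
# The YAML file passed as input_file_spec_path is not shown here. Based only
# on how main() indexes input_file_specs above, File inputs appear to need a
# "path" entry, while string and string[] inputs map directly to their values.
# A hypothetical spec, expressed as the Python structure yaml.load() would
# return (all names and paths are made up):
input_file_specs = {
    "reference_catalog": {"path": "/data/inputs/reference_catalog.fits"},  # type: File
    "output_prefix": "run-001",                                            # type: string
    "filters": ["g", "r", "i"],                                            # type: string[]
}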
import os
import pwd
import time

import Pegasus.DAX3 as DAX3

import desc.imsim_deep_pipeline as idp

USER = pwd.getpwuid(os.getuid())[0]

# Create an abstract DAG
dax = DAX3.ADAG("imsim_pipeline")

# Add some workflow-level metadata
dax.metadata("creator", "%s@%s" % (USER, os.uname()[1]))
dax.metadata("created", time.ctime())

dither_info_file = 'dither_info.pkl'
sensor_lists = idp.SensorLists(dither_info_file)
for visit, visit_info in sensor_lists.visits:
    band = visit_info.band
    for sensor_id in visit_info.sensor_ids:
        make_instcat = DAX3.Job('make_instcat')
        make_instcat.addArguments(visit, sensor_id)
        instcat = DAX3.File('instcat_%(visit)s_%(sensor_id)s.txt' % locals())
        make_instcat.uses(instcat, link=DAX3.Link.OUTPUT, transfer=True,
                          register=True)
        dax.addJob(make_instcat)

        run_imsim = DAX3.Job('run_imsim')
        run_imsim.uses(instcat, link=DAX3.Link.INPUT)
        dax.addJob(run_imsim)
        dax.depends(run_imsim, make_instcat)

        eimage = DAX3.File('lsst_e_%(visit)s_%(sensor_id)s_%(band)s.fits' % locals())
def generateDax(name="object", inputData=None): """Generate a Pegasus DAX abstract workflow""" dax = peg.ADAG(name) # These config-ish files are expected in the input/ folder schemaAbh = peg.File("schema.abh") dax.addFile(schemaAbh) sedScript = peg.File("fixCsv.sed") dax.addFile(sedScript) partCfg = peg.File("partition.json") dax.addFile(partCfg) catYaml = peg.File("hsc.yaml") dax.addFile(catYaml) # (Ab)using the shared filesystem....!!! chunkBaseFolder = os.path.join("/project", "hchiang2", "qserv", "qqpoc") if not os.path.isdir(chunkBaseFolder): logging.warning("Chunk file base folder %s invalid", chunkBaseFolder) # Create a new database and the Object table in Qserv task0a = peg.Job(name="replctl-register") task0a.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB")) task0a.addArguments("http://lsst-qserv-master03:25080", str(database), "--felis", catYaml, "-v") dax.addJob(task0a) logfile = peg.File("qingest-a.log") dax.addFile(logfile) task0a.setStdout(logfile) task0a.setStderr(logfile) task0a.uses(logfile, link=peg.Link.OUTPUT) task0a.uses(catYaml, link=peg.Link.INPUT) # Start a super-transaction # Need to get the super transaction id from the log file task0c = peg.Job(name="replctl-trans") task0c.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB")) task0c.addArguments("http://lsst-qserv-master03:25080", str(database), "--start") dax.addJob(task0c) transIdFile = peg.File("qingest-c.log") dax.addFile(transIdFile) task0c.setStdout(transIdFile) task0c.setStderr(transIdFile) task0c.uses(transIdFile, link=peg.Link.OUTPUT) dax.depends(parent=task0a, child=task0c) # Commit a super-transaction task0d = peg.Job(name="replctl-trans") task0d.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB")) task0d.addArguments("http://lsst-qserv-master03:25080", str(database), "-a") dax.addJob(task0d) logfile = peg.File("qingest-d.log") dax.addFile(logfile) task0d.setStdout(logfile) task0d.setStderr(logfile) task0d.uses(logfile, link=peg.Link.OUTPUT) i = 0 with open(inputData, 'r') as f: for line in f: inparq = line.strip() i += 1 logging.debug('Add file %d: %s', i, inparq) taskname = 'hackType' task1 = peg.Job(name=taskname) task1.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "20GB")) outparq = peg.File("hack-%d.parq" % i) dax.addFile(outparq) task1.addArguments("-i", inparq, "-o", outparq) dax.addJob(task1) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task1.setStdout(logfile) task1.setStderr(logfile) task1.uses(logfile, link=peg.Link.OUTPUT) task1.uses(outparq, link=peg.Link.OUTPUT) taskname = 'pq2csv' task2 = peg.Job(name=taskname) task2.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "20GB")) outcsv = peg.File("csv-%d.csv" % i) dax.addFile(outcsv) task2.addArguments("--schema", schemaAbh, "--verbose", outparq, outcsv) dax.addJob(task2) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task2.setStdout(logfile) task2.setStderr(logfile) task2.uses(logfile, link=peg.Link.OUTPUT) task2.uses(schemaAbh, link=peg.Link.INPUT) task2.uses(outparq, link=peg.Link.INPUT) task2.uses(outcsv, link=peg.Link.OUTPUT) dax.depends(parent=task1, child=task2) taskname = 'sed' task3 = peg.Job(name=taskname) task3.addProfile( peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB")) task3.addArguments("-f", sedScript, outcsv) dax.addJob(task3) logfile = peg.File("%s-%s.log" % ( taskname, i, )) newcsv = peg.File("new-%s.csv" % (i, )) dax.addFile(logfile) task3.setStdout(newcsv) 
            task3.setStderr(logfile)
            task3.uses(logfile, link=peg.Link.OUTPUT)
            task3.uses(newcsv, link=peg.Link.OUTPUT)
            task3.uses(outcsv, link=peg.Link.INPUT)
            task3.uses(sedScript, link=peg.Link.INPUT)
            dax.depends(parent=task2, child=task3)

            # My input csv files are larger than 1GB each and I am not splitting them for now
            taskname = 'partition'
            task4 = peg.Job(name=taskname)
            task4.addProfile(
                peg.Profile(peg.Namespace.CONDOR, "request_memory", "15GB"))
            outdir = os.path.join(chunkBaseFolder, 'chunksSet' + str(i))
            task4.addArguments("--verbose", "-c", partCfg, "--in.path", newcsv,
                               "--out.dir", outdir)
            dax.addJob(task4)
            logfile = peg.File("%s-%s.log" % (taskname, i))
            dax.addFile(logfile)
            task4.setStdout(logfile)
            task4.setStderr(logfile)
            task4.uses(logfile, link=peg.Link.OUTPUT)
            task4.uses(newcsv, link=peg.Link.INPUT)
            task4.uses(partCfg, link=peg.Link.INPUT)
            dax.depends(parent=task3, child=task4)

            # Look for chunk files in the output folder of this partitioning.
            # Cannot handle smaller job units at dax creation as the folder is not yet populated;
            # if we want smaller units, consider using dynamic subworkflow
            taskname = 'allocateChunk'
            task5 = peg.Job(name=taskname)
            task5.addProfile(
                peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB"))
            task5.addArguments(outdir, "--idFile", transIdFile)
            dax.addJob(task5)
            logfile = peg.File("%s-%s.log" % (taskname, i))
            dax.addFile(logfile)
            task5.setStdout(logfile)
            task5.setStderr(logfile)
            task5.uses(logfile, link=peg.Link.OUTPUT)
            task5.uses(transIdFile, link=peg.Link.INPUT)
            dax.depends(parent=task4, child=task5)
            dax.depends(parent=task0c, child=task5)

            taskname = 'loadData'
            task6 = peg.Job(name=taskname)
            task6.addProfile(
                peg.Profile(peg.Namespace.CONDOR, "request_memory", "2GB"))
            task6.addArguments(logfile)
            dax.addJob(task6)
            task6.uses(logfile, link=peg.Link.INPUT)
            logfile6 = peg.File("%s-%s.log" % (taskname, i))
            dax.addFile(logfile6)
            task6.setStdout(logfile6)
            task6.setStderr(logfile6)
            task6.uses(logfile6, link=peg.Link.OUTPUT)
            dax.depends(parent=task5, child=task6)
            dax.depends(parent=task6, child=task0d)

    return dax
import os
import pwd
import time

import Pegasus.DAX3 as DAX3

USER = pwd.getpwuid(os.getuid())[0]

# Create an abstract DAG
dax = DAX3.ADAG("Strong Lensing Pipeline")

# Add some workflow-level metadata
dax.metadata("creator", "%s@%s" % (USER, os.uname()[1]))
dax.metadata("created", time.ctime())

dm_level1_catalog = DAX3.File('dm_level1_catalog')
dm_images = DAX3.File('dm_image_data')

SL_candidates = DAX3.File('SL_candidates')
SLFinder = DAX3.Job('SLFinder')
SLFinder.uses(dm_level1_catalog, link=DAX3.Link.INPUT)
SLFinder.uses(dm_images, link=DAX3.Link.INPUT)
SLFinder.uses(SL_candidates, link=DAX3.Link.OUTPUT, register=True, transfer=True)
dax.addJob(SLFinder)

DESC_Lenses = DAX3.File('DESC_Lenses')
SpaceWarps = DAX3.Job('SpaceWarps')
SpaceWarps.uses(SL_candidates, link=DAX3.Link.INPUT)
SpaceWarps.uses(DESC_Lenses, link=DAX3.Link.OUTPUT, register=True, transfer=True)
def generateDax(name="object", inputData=None): """Generate a Pegasus DAX abstract workflow""" dax = peg.ADAG(name) # These config-ish files are expected in the input/ folder schemaAbh = peg.File("schema.abh") dax.addFile(schemaAbh) sedScript = peg.File("fixCsv.sed") dax.addFile(sedScript) partCfg = peg.File("Object_new.cfg") dax.addFile(partCfg) # Note this json file has the database name....!!! tableJson = peg.File("test.json") dax.addFile(tableJson) database = "hsc_rc2_w_2020_14_00" # (Ab)using the shared filesystem....!!! chunkBaseFolder = os.path.join("/project", "hchiang2", "qserv", "qqpoc") if not os.path.isdir(chunkBaseFolder): logging.warning("Chunk file base folder %s invalid", chunkBaseFolder) # Create a new database task0a = peg.Job(name="qingest") task0a.addArguments("http://lsst-qserv-master01:25080/ingest/v1/database", "post", "--data", "database=" + str(database), "num_stripes=340 num_sub_stripes=3 overlap=0.01667") dax.addJob(task0a) logfile = peg.File("qingest-a.log") dax.addFile(logfile) task0a.setStdout(logfile) task0a.setStderr(logfile) task0a.uses(logfile, link=peg.Link.OUTPUT) # Create the Object table in Qserv task0b = peg.Job(name="qingest") task0b.addArguments("http://lsst-qserv-master01:25080/ingest/v1/table", "post", "--json", tableJson) dax.addJob(task0b) logfile = peg.File("qingest-b.log") dax.addFile(logfile) task0b.setStdout(logfile) task0b.setStderr(logfile) task0b.uses(logfile, link=peg.Link.OUTPUT) task0b.uses(tableJson, link=peg.Link.INPUT) dax.depends(parent=task0a, child=task0b) # Start a super-transaction # Need to get the super transaction id from the log file task0c = peg.Job(name="qingest") task0c.addArguments("http://lsst-qserv-master01:25080/ingest/v1/trans", "post", "--data", "database=" + str(database)) dax.addJob(task0c) transIdFile = peg.File("qingest-c.log") dax.addFile(transIdFile) task0c.setStdout(transIdFile) task0c.setStderr(transIdFile) task0c.uses(transIdFile, link=peg.Link.OUTPUT) dax.depends(parent=task0b, child=task0c) i = 0 with open(inputData, 'r') as f: for line in f: inparq = line.strip() i += 1 logging.debug('Add file %d: %s', i, inparq) taskname = 'hackType' task1 = peg.Job(name=taskname) outparq = peg.File("hack-%d.parq" % i) dax.addFile(outparq) task1.addArguments("-i", inparq, "-o", outparq) dax.addJob(task1) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task1.setStdout(logfile) task1.setStderr(logfile) task1.uses(logfile, link=peg.Link.OUTPUT) task1.uses(outparq, link=peg.Link.OUTPUT) taskname = 'pq2csv' task2 = peg.Job(name=taskname) outcsv = peg.File("csv-%d.csv" % i) dax.addFile(outcsv) task2.addArguments("--schema", schemaAbh, "--verbose", outparq, outcsv) dax.addJob(task2) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task2.setStdout(logfile) task2.setStderr(logfile) task2.uses(logfile, link=peg.Link.OUTPUT) task2.uses(schemaAbh, link=peg.Link.INPUT) task2.uses(outparq, link=peg.Link.INPUT) task2.uses(outcsv, link=peg.Link.OUTPUT) dax.depends(parent=task1, child=task2) taskname = 'sed' task3 = peg.Job(name=taskname) task3.addArguments("-f", sedScript, outcsv) dax.addJob(task3) logfile = peg.File("%s-%s.log" % ( taskname, i, )) newcsv = peg.File("new-%s.csv" % (i, )) dax.addFile(logfile) task3.setStdout(newcsv) task3.setStderr(logfile) task3.uses(logfile, link=peg.Link.OUTPUT) task3.uses(newcsv, link=peg.Link.OUTPUT) task3.uses(outcsv, link=peg.Link.INPUT) task3.uses(sedScript, link=peg.Link.INPUT) dax.depends(parent=task2, child=task3) # My input csv files are larger 
than 1GB each and I am not splitting them for now taskname = 'partition' task4 = peg.Job(name=taskname) outdir = os.path.join(chunkBaseFolder, 'chunksSet' + str(i)) task4.addArguments("--verbose", "-c", partCfg, "--in", newcsv, "--out.dir", outdir) dax.addJob(task4) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task4.setStdout(logfile) task4.setStderr(logfile) task4.uses(logfile, link=peg.Link.OUTPUT) task4.uses(newcsv, link=peg.Link.INPUT) task4.uses(partCfg, link=peg.Link.INPUT) dax.depends(parent=task3, child=task4) # Look for chunk files in the output folder of this partitiong # Cannot handle smaller job units at dax creation as the folder is not yet populated; # if we want smaller units, consider using dynamic subworkflow taskname = 'allocateChunk' task5 = peg.Job(name=taskname) task5.addArguments(outdir, "--idFile", transIdFile) dax.addJob(task5) logfile = peg.File("%s-%s.log" % ( taskname, i, )) dax.addFile(logfile) task5.setStdout(logfile) task5.setStderr(logfile) task5.uses(logfile, link=peg.Link.OUTPUT) dax.depends(parent=task4, child=task5) dax.depends(parent=task0c, child=task5) return dax
job_maker = JobMaker(dax, output_repo, config_dir, bin_dir='./bin',
                     tc='tc.txt', clobber=True)

# Ingest the raw images.
ingestImages = job_maker.make('ingestImages', repo=input_repo,
                              options={'--output': output_repo})

# Ingest the reference catalog.
ref_cat = '/global/homes/d/descdm/dc1/DC1-imsim-dithered/dc1_reference_catalog.txt'
ingestReferenceCatalog = DAX3.Job('ingestReferenceCatalog')
ingestReferenceCatalog.addArguments(ref_cat, output_repo)
dax.addJob(ingestReferenceCatalog)
dax.depends(ingestReferenceCatalog, ingestImages)
job_maker.add_tc_entry(job_maker, 'ingestReferenceCatalog')

makeDiscreteSkyMap = job_maker.make('makeDiscreteSkyMap')

# Loop over visits
for visit in visit_list(output_repo):
    # Loop over rafts
    for raft in raft_list(visit):
        dataId = dict(visit=visit, raft=raft)
        processCcd = job_maker.make('processCcd', dataId=dataId)
        dax.depends(processCcd, ingestReferenceCatalog)
        dax.depends(makeDiscreteSkyMap, processCcd)