def generate_dax(self, daxfile):
    import os

    from Pegasus.DAX3 import ADAG, Job, File, Link

    # The DAX generator
    dax = ADAG("pipeline")

    # Some bits of metadata.  Should put plenty more here.
    dax.metadata("owner", self.pipeline.owner)
    dax.metadata("basename", self.pipeline.basename)
    dax.metadata("version", self.pipeline.version)

    # string tag -> pegasus File object mapping of all the
    # inputs and outputs used by any pipeline stage.
    files = {}

    # First generate the overall inputs to the pipeline,
    # i.e. ones that are not generated by any other stage
    # but must be specified at the start.
    for tag in self.pipeline.input_tags():
        path = self.info['inputs'].get(tag)
        files[tag] = File(path)

    # Now go through the pipeline in sequence.
    for stage_name, stage_class in self.pipeline.sequence():
        # The stage in the pipeline.  We describe its meaning
        # (which image it corresponds to) in the transformation
        # catalog generation.
        job = Job(stage_name, id=stage_name)

        # Configuration files for this job.  These will not be built
        # during the pipeline and must be provided by the user.
        for config_tag, config_filename in stage_class.config.items():
            filename = self.pipeline.cfg[stage_name]['config'][config_tag]
            config_path = os.path.join(self.config_dir(), filename)
            config = File(config_path)
            job.uses(config, link=Link.INPUT)

        # Input files for the job, either created by the user or by
        # previous stages.  In either case they should be in the "files"
        # dictionary, because precursor jobs will have been added before
        # this one.
        for input_tag in stage_class.inputs.keys():
            job.uses(files[input_tag], link=Link.INPUT)

        # Output files from the job.  These will be created by the job
        # and used by future jobs.
        for output_tag, output_type in stage_class.outputs.items():
            output_filename = "{}.{}".format(output_tag, output_type)
            output = File(output_filename)
            job.uses(output, link=Link.OUTPUT, transfer=True, register=True)
            files[output_tag] = output

        # Add this job to the pipeline.
        dax.addJob(job)

        # Tell Pegasus which jobs this one depends on.  The pipeline
        # already knows this information.  pipeline.sequence() runs
        # through the jobs in an order that guarantees a job's
        # predecessors are always added before it, so they will always
        # exist in the DAX by this point.
        for predecessor_name in self.pipeline.dependencies(stage_name):
            dax.depends(stage_name, predecessor_name)

    # Generate the final DAX XML file.
    with open(daxfile, "w") as f:
        dax.writeXML(f)
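# The comments in generate_dax() defer the mapping from stage name to an
# actual executable/image to "the transformation catalog generation".  Below
# is a minimal sketch of that counterpart; the method name and the
# "/usr/bin/<stage>" install paths are assumptions for illustration, not part
# of the original class.  It emits Pegasus' text transformation catalog
# format, one "tr" entry per pipeline stage.
def generate_transformation_catalog(self, tcfile):
    with open(tcfile, "w") as f:
        for stage_name, _ in self.pipeline.sequence():
            f.write(
                'tr %s {\n'
                '    site local {\n'
                '        pfn "/usr/bin/%s"\n'
                '        arch "x86_64"\n'
                '        os "linux"\n'
                '        type "INSTALLED"\n'
                '    }\n'
                '}\n' % (stage_name, stage_name)
            )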
def write(self, filename, name='dax'):
    """Generate Pegasus abstract workflow (DAX).

    Parameters
    ----------
    filename : `str`
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.
    """
    dax = ADAG(name)

    # Add files to DAX-level replica catalog.
    catalog = {}
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        f = File(attrs['lfn'])

        # Add physical file names, if any.
        urls = attrs.get('urls')
        if urls is not None:
            urls = urls.split(',')
            sites = attrs.get('sites')
            if sites is None:
                sites = len(urls) * ['local']
            else:
                sites = sites.split(',')
            for url, site in zip(urls, sites):
                f.addPFN(PFN(url, site))
        catalog[attrs['lfn']] = f
        dax.addFile(f)

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        job = Job(name=attrs['name'], id=task_id)

        # Add job command line arguments replacing any file name with
        # the respective Pegasus file object.
        args = attrs.get('args')
        if args:
            args = args.split()
            lfns = list(set(catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs.
        inputs = [file_id for file_id in self.graph.predecessors(task_id)]
        for file_id in inputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.INPUT)

        # Specify job's outputs.
        outputs = [file_id for file_id in self.graph.successors(task_id)]
        for file_id in outputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.OUTPUT)

            streams = attrs.get('streams')
            if streams is not None:
                if streams & 1 != 0:
                    job.setStdout(f)
                if streams & 2 != 0:
                    job.setStderr(f)

        dax.addJob(job)

    # Add job dependencies to the DAX.
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id),
                        child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)
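# For context, a minimal sketch (node ids and attribute values assumed, not
# taken from the original module) of the bipartite graph that write()
# expects: file nodes and task nodes in a networkx DiGraph, with
# file -> task edges for inputs and task -> file edges for outputs.  The
# attribute keys ('lfn', 'urls', 'sites', 'name', 'args', 'streams') mirror
# the lookups above.
import networkx as nx

graph = nx.DiGraph()
graph.add_node('in.dat', lfn='in.dat', urls='file:///data/in.dat', sites='local')
graph.add_node('out.dat', lfn='out.dat', streams=1)  # bits: 1=stdout, 2=stderr
graph.add_node('task0', name='transform', args='in.dat out.dat')
graph.add_edge('in.dat', 'task0')   # input file -> task
graph.add_edge('task0', 'out.dat')  # task -> output file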
Profile(namespace="pegasus", key="clusters.size", value=config.get('all', 'clusters_size'))) sleep.addProfile( Profile(namespace="pegasus", key="clusters.maxruntime", value=config.get('all', 'clusters_maxruntime'))) cluster.addExecutable(sleep) for i in range(4): job = Job(namespace="cluster", name="level1", version="1.0") job.addArguments('-a level1 -T ' + str(i + 1)) job.addArguments('-i', a) job.addProfile( Profile(namespace="pegasus", key="job.runtime", value=str(i + 1))) job.uses(a, link=Link.INPUT) cluster.addJob(job) for j in range(4): child = Job(namespace="cluster", name="level2", version="1.0") child.addArguments('-a level2 -T ' + str((j + 1) * 2)) child.addProfile( Profile(namespace="pegasus", key="runtime", value=str( (j + 1) * 2))) cluster.addJob(child) cluster.depends(parent=job, child=child) # Write the DAX to standard out cluster.writeXML(sys.stdout)
# Add input file to the DAX-level replica catalog
a = File("f.a")
a.addPFN(PFN(config.get('all', 'file_url') + input_file + "/f.a",
             config.get('all', 'file_site')))
cluster.addFile(a)

for i in range(1, 3):
    sleep = Executable(namespace="cluster", name="level" + str(i),
                       version="1.0", os="linux", arch="x86",
                       installed=config.getboolean('all', 'executable_installed'))
    sleep.addPFN(PFN(config.get('all', 'executable_url') + sys.argv[1] + "/bin/pegasus-keg",
                     config.get('all', 'executable_site')))
    sleep.addProfile(Profile(namespace="pegasus", key="clusters.size",
                             value=config.get('all', 'clusters_size')))
    sleep.addProfile(Profile(namespace="pegasus", key="clusters.maxruntime",
                             value=config.get('all', 'clusters_maxruntime')))
    cluster.addExecutable(sleep)

for i in range(4):
    job = Job(namespace="cluster", name="level1", version="1.0")
    job.addArguments('-a level1 -T ' + str(i + 1))
    job.addArguments('-i', a)
    job.addProfile(Profile(namespace="pegasus", key="job.runtime",
                           value=str(i + 1)))
    job.uses(a, link=Link.INPUT)
    cluster.addJob(job)

    for j in range(4):
        child = Job(namespace="cluster", name="level2", version="1.0")
        child.addArguments('-a level2 -T ' + str((j + 1) * 2))
        child.addProfile(Profile(namespace="pegasus", key="runtime",
                                 value=str((j + 1) * 2)))
        cluster.addJob(child)
        cluster.depends(parent=job, child=child)

# Write the DAX to standard out
cluster.writeXML(sys.stdout)
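# Note: the clusters.size / clusters.maxruntime profiles attached above only
# take effect when clustering is enabled at plan time.  A sketch of the
# planner invocation (DAX file name and site name assumed):
#
#     pegasus-plan --dax cluster.dax --sites local --cluster horizontal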
def write_dax(self, filename='workflow.dax', name='workflow'):
    """Generate Pegasus abstract workflow (DAX).

    Parameters
    ----------
    filename : `str`
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.

    Raises
    ------
    `ValueError`
        If either task or file node is missing a mandatory attribute.
    """
    dax = ADAG(name)

    # Process file nodes.
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        try:
            name = attrs['lfn']
        except KeyError:
            msg = 'Mandatory attribute "{}" is missing.'
            raise ValueError(msg.format('lfn'))
        file_ = File(name)

        # Add physical file names, if any.
        urls = attrs.get('pfn')
        if urls is not None:
            urls = urls.split(',')
            sites = attrs.get('sites')
            if sites is None:
                sites = len(urls) * ['condorpool']
            else:
                sites = sites.split(',')
            for url, site in zip(urls, sites):
                file_.addPFN(PFN(url, site))
        self.catalog[attrs['lfn']] = file_

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        try:
            name = attrs['exec_name']
        except KeyError:
            msg = 'Mandatory attribute "{}" is missing.'
            raise ValueError(msg.format('exec_name'))
        label = '{name}_{id}'.format(name=name, id=task_id)
        job = Job(name, id=task_id, node_label=label)

        # Add job command line arguments replacing any file name with
        # the respective Pegasus file object.
        args = attrs.get('exec_args', [])
        if args:
            args = args.split()
            lfns = list(set(self.catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = self.catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs.
        inputs = [file_id for file_id in self.graph.predecessors(task_id)]
        for file_id in inputs:
            attrs = self.graph.node[file_id]
            is_ignored = attrs.get('ignore', False)
            if not is_ignored:
                file_ = self.catalog[attrs['lfn']]
                job.uses(file_, link=Link.INPUT)

        # Specify job's outputs.
        outputs = [file_id for file_id in self.graph.successors(task_id)]
        for file_id in outputs:
            attrs = self.graph.node[file_id]
            is_ignored = attrs.get('ignore', False)
            if not is_ignored:
                file_ = self.catalog[attrs['lfn']]
                job.uses(file_, link=Link.OUTPUT)

                streams = attrs.get('streams')
                if streams is not None:
                    if streams & 1 != 0:
                        job.setStdout(file_)
                    if streams & 2 != 0:
                        job.setStderr(file_)

        # Provide default files to store stdout and stderr, if not
        # specified explicitly.
        if job.stdout is None:
            file_ = File('{name}.out'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStdout(file_)
        if job.stderr is None:
            file_ = File('{name}.err'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStderr(file_)

        dax.addJob(job)

    # Add job dependencies to the DAX.
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id),
                        child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)
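# Hypothetical usage of write_dax() (the surrounding class name and its
# constructor are assumptions for illustration):
#
#     gen = DaxGenerator(graph)           # builds self.graph / self.catalog
#     gen.write_dax('workflow.dax', name='workflow')
#
# The resulting file is then handed to pegasus-plan.  Note that the default
# <label>.out / <label>.err files are registered as job outputs above, so
# per-job logs are staged back alongside the data products.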
def generate_workflow(self):
    "Generate a workflow (DAX, config files, and replica catalog)"
    ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    dax = ADAG("mgrast-prod-%s" % ts)

    # These are all the global input files for the workflow
    metagenome = File(self.mgfile)
    self.add_replica(self.mgfile, os.path.abspath(self.mgfile))

    # QC job
    qcJob = Job("wrapper-qc", node_label="wrapper-qc")
    qcJob.addArguments("-input", self.mgfile)
    qcJob.addArguments("-format", self.file_format)
    qcJob.addArguments("-out_prefix", "075")
    qcJob.addArguments("-assembled", self.assembled)
    qcJob.addArguments("-filter_options", self.filter_options)
    qcJob.addArguments("-proc", "8")
    qcJob.uses(metagenome, link=Link.INPUT)
    qcJob.uses("075.assembly.coverage", link=Link.OUTPUT, transfer=False)
    qcJob.uses("075.qc.stats", link=Link.OUTPUT, transfer=False)
    qcJob.uses("075.upload.stats", link=Link.OUTPUT, transfer=False)
    qcJob.profile("globus", "maxwalltime", "60")
    qcJob.profile("globus", "hostcount", "8")
    qcJob.profile("globus", "count", "8")
    dax.addJob(qcJob)

    # Preprocess Job
    preprocessJob = Job("wrapper-preprocess", node_label="wrapper-preprocess")
    preprocessJob.addArguments("-input", self.mgfile)
    preprocessJob.addArguments("-format", self.file_format)
    preprocessJob.addArguments("-out_prefix", "100.preprocess")
    preprocessJob.addArguments("-filter_options", self.filter_options)
    preprocessJob.uses(metagenome, link=Link.INPUT)
    preprocessJob.uses("100.preprocess.passed.fna", link=Link.OUTPUT, transfer=False)
    preprocessJob.uses("100.preprocess.removed.fna", link=Link.OUTPUT, transfer=False)
    preprocessJob.profile("globus", "maxwalltime", "20")
    dax.addJob(preprocessJob)

    # Dereplicate Job
    dereplicateJob = Job("wrapper-dereplicate", node_label="wrapper-dereplicate")
    dereplicateJob.addArguments("-input=100.preprocess.passed.fna")
    dereplicateJob.addArguments("-out_prefix=150.dereplication")
    dereplicateJob.addArguments("-prefix_length=%s" % self.prefix_length)
    dereplicateJob.addArguments("-dereplicate=%s" % self.dereplicate)
    dereplicateJob.addArguments("-memory=10")
    dereplicateJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
    dereplicateJob.uses("150.dereplication.passed.fna", link=Link.OUTPUT, transfer=False)
    dereplicateJob.uses("150.dereplication.removed.fna", link=Link.OUTPUT, transfer=False)
    dereplicateJob.profile("globus", "maxwalltime", "10")
    dax.addJob(dereplicateJob)
    dax.depends(dereplicateJob, preprocessJob)

    # Bowtie Screen Job
    bowtieJob = Job("wrapper-bowtie-screen", node_label="wrapper-bowtie-screen")
    bowtieJob.addArguments("-input=150.dereplication.passed.fna")
    bowtieJob.addArguments("-output=299.screen.passed.fna")
    bowtieJob.addArguments("-index=%s" % self.screen_indexes)
    bowtieJob.addArguments("-bowtie=%s" % self.bowtie)
    bowtieJob.addArguments("-proc=8")
    bowtieJob.uses("150.dereplication.passed.fna", link=Link.INPUT)
    bowtieJob.uses("299.screen.passed.fna", link=Link.OUTPUT, transfer=False)
    bowtieJob.profile("globus", "maxwalltime", "30")
    bowtieJob.profile("globus", "hostcount", "8")
    bowtieJob.profile("globus", "count", "8")
    dax.addJob(bowtieJob)
    dax.depends(bowtieJob, dereplicateJob)

    # Genecalling Job
    geneJob = Job("wrapper-genecalling", node_label="wrapper-genecalling")
    geneJob.addArguments("-input=299.screen.passed.fna")
    geneJob.addArguments("-out_prefix=350.genecalling.coding")
    geneJob.addArguments("-type=%s" % self.fgs_type)
    geneJob.addArguments("-size=100")
    geneJob.addArguments("-proc=8")
    geneJob.uses("299.screen.passed.fna", link=Link.INPUT)
    geneJob.uses("350.genecalling.coding.faa", link=Link.OUTPUT, transfer=False)
    geneJob.uses("350.genecalling.coding.fna", link=Link.OUTPUT, transfer=False)
    geneJob.profile("globus", "maxwalltime", "30")
    geneJob.profile("globus", "hostcount", "8")
    geneJob.profile("globus", "count", "8")
    dax.addJob(geneJob)
    dax.depends(geneJob, bowtieJob)

    # Cluster (Genecalling) Job
    cluster1Job = Job("wrapper-cluster", node_label="wrapper-cluster")
    cluster1Job.addArguments("-input=350.genecalling.coding.faa")
    cluster1Job.addArguments("-out_prefix=550.cluster")
    cluster1Job.addArguments("-aa")
    cluster1Job.addArguments("-pid=%s" % self.aa_pid)
    cluster1Job.addArguments("-memory=20")
    cluster1Job.uses("350.genecalling.coding.faa", link=Link.INPUT)
    cluster1Job.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.OUTPUT, transfer=False)
    cluster1Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.OUTPUT, transfer=False)
    cluster1Job.profile("globus", "maxwalltime", "10")
    dax.addJob(cluster1Job)
    dax.depends(cluster1Job, geneJob)

    # Blat_prot Job
    blatprotJob = Job("wrapper-blat-prot", node_label="wrapper-blat-prot")
    blatprotJob.addArguments("--input=550.cluster.aa%s.faa" % self.aa_pid)
    blatprotJob.addArguments("--output=650.superblat.sims")
    blatprotJob.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.INPUT)
    blatprotJob.uses("650.superblat.sims", link=Link.OUTPUT, transfer=False)
    blatprotJob.profile("globus", "maxwalltime", "2880")
    blatprotJob.profile("globus", "hostcount", "24")
    blatprotJob.profile("globus", "count", "24")
    dax.addJob(blatprotJob)
    dax.depends(blatprotJob, cluster1Job)

    # Annotate Sims (Blat Prot) Job
    annotatesims1Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
    annotatesims1Job.addArguments("-input=650.superblat.sims")
    annotatesims1Job.addArguments("-out_prefix=650")
    annotatesims1Job.addArguments("-aa")
    annotatesims1Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    annotatesims1Job.addArguments("-ann_file=m5nr_v1.bdb")
    annotatesims1Job.uses("650.superblat.sims", link=Link.INPUT)
    annotatesims1Job.uses("650.aa.sims.filter", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.protein", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.lca", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.ontology", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.profile("globus", "maxwalltime", "720")
    dax.addJob(annotatesims1Job)
    dax.depends(annotatesims1Job, blatprotJob)

    # Search RNA Job
    searchJob = Job("wrapper-search-rna", node_label="wrapper-search-rna")
    searchJob.addArguments("-input=100.preprocess.passed.fna")
    searchJob.addArguments("-output=425.search.rna.fna")
    searchJob.addArguments("-rna_nr=%s" % self.m5rna_clust)
    searchJob.addArguments("-size=100")
    searchJob.addArguments("-proc=8")
    searchJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
    searchJob.uses("425.search.rna.fna", link=Link.OUTPUT, transfer=False)
    searchJob.profile("globus", "maxwalltime", "120")
    searchJob.profile("globus", "hostcount", "8")
    searchJob.profile("globus", "count", "8")
    dax.addJob(searchJob)
    dax.depends(searchJob, preprocessJob)

    # Cluster (Search RNA) Job
    cluster2Job = Job("wrapper-cluster", node_label="wrapper-cluster")
    cluster2Job.addArguments("-input=425.search.rna.fna")
    cluster2Job.addArguments("-out_prefix=440.cluster")
    cluster2Job.addArguments("-rna")
    cluster2Job.addArguments("-pid=%s" % self.rna_pid)
    cluster2Job.addArguments("-memory=20")
    cluster2Job.uses("425.search.rna.fna", link=Link.INPUT)
    cluster2Job.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.OUTPUT, transfer=False)
    cluster2Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.OUTPUT, transfer=False)
    cluster2Job.profile("globus", "maxwalltime", "30")
    dax.addJob(cluster2Job)
    dax.depends(cluster2Job, searchJob)

    # Blat_rna Job
    blatrnaJob = Job("wrapper-blat-rna", node_label="wrapper-blat-rna")
    blatrnaJob.addArguments("--input=440.cluster.rna%s.fna" % self.rna_pid)
    blatrnaJob.addArguments("-rna_nr=m5rna")
    blatrnaJob.addArguments("--output=450.rna.sims")
    blatrnaJob.addArguments("-assembled=%s" % self.assembled)
    blatrnaJob.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.INPUT)
    blatrnaJob.uses("450.rna.sims", link=Link.OUTPUT, transfer=False)
    blatrnaJob.profile("globus", "maxwalltime", "20")
    dax.addJob(blatrnaJob)
    dax.depends(blatrnaJob, cluster2Job)

    # Annotate Sims (Blat RNA) Job
    annotatesims2Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
    annotatesims2Job.addArguments("-input=450.rna.sims")
    annotatesims2Job.addArguments("-out_prefix=450")
    annotatesims2Job.addArguments("-rna")
    annotatesims2Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    annotatesims2Job.addArguments("-ann_file=m5nr_v1.bdb")
    annotatesims2Job.uses("450.rna.sims", link=Link.INPUT)
    annotatesims2Job.uses("450.rna.sims.filter", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.uses("450.rna.expand.rna", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.uses("450.rna.expand.lca", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.profile("globus", "maxwalltime", "30")
    dax.addJob(annotatesims2Job)
    dax.depends(annotatesims2Job, blatrnaJob)

    # Index Sim Seq Job
    indexJob = Job("wrapper-index", node_label="wrapper-index")
    indexJob.addArguments("-in_seqs=350.genecalling.coding.fna")
    indexJob.addArguments("-in_seqs=425.search.rna.fna")
    indexJob.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    indexJob.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    indexJob.addArguments("-in_sims=650.aa.sims.filter")
    indexJob.addArguments("-in_sims=450.rna.sims.filter")
    indexJob.addArguments("-output=700.annotation.sims.filter.seq")
    indexJob.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    indexJob.addArguments("-memory=10")
    indexJob.addArguments("-ann_file=m5nr_v1.bdb")
    indexJob.uses("350.genecalling.coding.fna", link=Link.INPUT)
    indexJob.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    indexJob.uses("650.aa.sims.filter", link=Link.INPUT)
    indexJob.uses("425.search.rna.fna", link=Link.INPUT)
    indexJob.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    indexJob.uses("450.rna.sims.filter", link=Link.INPUT)
    indexJob.uses("700.annotation.sims.filter.seq", link=Link.OUTPUT, transfer=False)
    indexJob.uses("700.annotation.sims.filter.seq.index", link=Link.OUTPUT, transfer=False)
    indexJob.profile("globus", "maxwalltime", "120")
    dax.addJob(indexJob)
    dax.depends(indexJob, geneJob)
    dax.depends(indexJob, cluster1Job)
    dax.depends(indexJob, cluster2Job)
    dax.depends(indexJob, searchJob)
    dax.depends(indexJob, annotatesims1Job)

    # Annotate Summary Job (13)
    summary13Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary13Job.addArguments("-job=1")
    summary13Job.addArguments("-in_expand=650.aa.expand.protein")
    summary13Job.addArguments("-in_expand=450.rna.expand.rna")
    summary13Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary13Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary13Job.addArguments("-in_assemb=075.assembly.coverage")
    summary13Job.addArguments("-in_index=700.annotation.sims.filter.seq.index")
    summary13Job.addArguments("-output=700.annotation.md5.summary")
    summary13Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary13Job.addArguments("-type=md5")
    summary13Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary13Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary13Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary13Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary13Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary13Job.uses("700.annotation.sims.filter.seq.index", link=Link.INPUT)
    summary13Job.uses("700.annotation.md5.summary", link=Link.OUTPUT, transfer=True)
    summary13Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary13Job)
    dax.depends(summary13Job, qcJob)
    dax.depends(summary13Job, cluster1Job)
    dax.depends(summary13Job, cluster2Job)
    dax.depends(summary13Job, indexJob)
    dax.depends(summary13Job, annotatesims1Job)
    dax.depends(summary13Job, annotatesims2Job)

    # Annotate Summary Job (14)
    summary14Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary14Job.addArguments("-job=1")
    summary14Job.addArguments("-in_expand=650.aa.expand.protein")
    summary14Job.addArguments("-in_expand=450.rna.expand.rna")
    summary14Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary14Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary14Job.addArguments("-in_assemb=075.assembly.coverage")
    summary14Job.addArguments("-output=700.annotation.function.summary")
    summary14Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary14Job.addArguments("-type=function")
    summary14Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary14Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary14Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary14Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary14Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary14Job.uses("700.annotation.function.summary", link=Link.OUTPUT, transfer=True)
    summary14Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary14Job)
    dax.depends(summary14Job, qcJob)
    dax.depends(summary14Job, cluster1Job)
    dax.depends(summary14Job, cluster2Job)
    dax.depends(summary14Job, annotatesims1Job)
    dax.depends(summary14Job, annotatesims2Job)

    # Annotate Summary Job (15)
    summary15Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary15Job.addArguments("-job=1")
    summary15Job.addArguments("-in_expand=650.aa.expand.protein")
    summary15Job.addArguments("-in_expand=450.rna.expand.rna")
    summary15Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary15Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary15Job.addArguments("-in_assemb=075.assembly.coverage")
    summary15Job.addArguments("-output=700.annotation.organism.summary")
    summary15Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary15Job.addArguments("-type=organism")
    summary15Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary15Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary15Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary15Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary15Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary15Job.uses("700.annotation.organism.summary", link=Link.OUTPUT, transfer=True)
    summary15Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary15Job)
    dax.depends(summary15Job, qcJob)
    dax.depends(summary15Job, cluster1Job)
    dax.depends(summary15Job, cluster2Job)
    dax.depends(summary15Job, annotatesims1Job)
    dax.depends(summary15Job, annotatesims2Job)

    # Annotate Summary Job (16)
    summary16Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary16Job.addArguments("-job=1")
    summary16Job.addArguments("-in_expand=650.aa.expand.lca")
    summary16Job.addArguments("-in_expand=450.rna.expand.lca")
    summary16Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary16Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary16Job.addArguments("-in_assemb=075.assembly.coverage")
    summary16Job.addArguments("-output=700.annotation.lca.summary")
    summary16Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary16Job.addArguments("-type=lca")
    summary16Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary16Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary16Job.uses("650.aa.expand.lca", link=Link.INPUT)
    summary16Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary16Job.uses("450.rna.expand.lca", link=Link.INPUT)
    summary16Job.uses("700.annotation.lca.summary", link=Link.OUTPUT, transfer=True)
    summary16Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary16Job)
    dax.depends(summary16Job, qcJob)
    dax.depends(summary16Job, cluster1Job)
    dax.depends(summary16Job, cluster2Job)
    dax.depends(summary16Job, annotatesims1Job)
    dax.depends(summary16Job, annotatesims2Job)

    # Annotate Summary Job (17)
    summary17Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary17Job.addArguments("-job=1")
    summary17Job.addArguments("-in_expand=650.aa.expand.ontology")
    summary17Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary17Job.addArguments("-in_assemb=075.assembly.coverage")
    summary17Job.addArguments("-output=700.annotation.ontology.summary")
    summary17Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary17Job.addArguments("-type=ontology")
    summary17Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary17Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary17Job.uses("650.aa.expand.ontology", link=Link.INPUT)
    summary17Job.uses("700.annotation.ontology.summary", link=Link.OUTPUT, transfer=True)
    summary17Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary17Job)
    dax.depends(summary17Job, qcJob)
    dax.depends(summary17Job, cluster1Job)
    dax.depends(summary17Job, annotatesims1Job)

    # Annotate Summary Job (18)
    summary18Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary18Job.addArguments("-job=1")
    summary18Job.addArguments("-in_expand=650.aa.expand.protein")
    summary18Job.addArguments("-in_expand=450.rna.expand.rna")
    summary18Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary18Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary18Job.addArguments("-in_assemb=075.assembly.coverage")
    summary18Job.addArguments("-output=700.annotation.source.stats")
    summary18Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary18Job.addArguments("-type=source")
    summary18Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary18Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary18Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary18Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary18Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary18Job.uses("700.annotation.source.stats", link=Link.OUTPUT, transfer=True)
    summary18Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary18Job)
    dax.depends(summary18Job, qcJob)
    dax.depends(summary18Job, cluster1Job)
    dax.depends(summary18Job, cluster2Job)
    dax.depends(summary18Job, annotatesims1Job)
    dax.depends(summary18Job, annotatesims2Job)

    # Write the DAX file
    dax.writeXMLFile(self.daxfile)

    # Generate the replica catalog
    self.generate_replica_catalog()
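# A minimal sketch of what generate_replica_catalog() might emit, in Pegasus'
# file-based replica catalog format (one "LFN PFN site" line per entry).  The
# rc file name and the self.replicas mapping (presumably filled by the
# add_replica() call above) are assumptions, not part of the original class.
def generate_replica_catalog(self):
    with open("rc.txt", "w") as f:
        for lfn, pfn in self.replicas.items():
            f.write('%s file://%s site="local"\n' % (lfn, pfn))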
def generate_dax(self):
    "Generate a workflow (DAX, config files, and replica catalog)"
    ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    dax = ADAG("refinement-%s" % ts)

    # These are all the global input files for the workflow
    coordinates = File(self.coordinates)
    parameters = File(self.parameters)
    extended_system = File(self.extended_system)
    topfile = File(self.topfile)
    sassena_db = File(self.sassena_db)
    incoherent_db = File(self.incoherent_db)
    coherent_db = File(self.coherent_db)

    # This job untars the sassena db and makes it available to the other
    # jobs in the workflow
    untarjob = Job("tar", node_label="untar")
    if self.is_synthetic_workflow:
        untarjob.addArguments("-p", "-xzvf", sassena_db.name)
        untarjob.addArguments("-a", "tar")
        for output_file in ["incoherent_db", "coherent_db"]:
            untarjob.addArguments(self.keg_params.output_file("tar", output_file, eval(output_file).name))
        self.keg_params.add_keg_params(untarjob)
    else:
        untarjob.addArguments("-xzvf", sassena_db)
    untarjob.uses(sassena_db, link=Link.INPUT)
    untarjob.uses(incoherent_db, link=Link.OUTPUT, transfer=False)
    untarjob.uses(coherent_db, link=Link.OUTPUT, transfer=False)
    untarjob.profile("globus", "jobtype", "single")
    untarjob.profile("globus", "maxwalltime", "1")
    untarjob.profile("globus", "count", "1")
    dax.addJob(untarjob)

    # For each charge that was listed in the config file
    for charge in self.charges:
        structure = "Q%s.psf" % charge

        # Equilibrate files
        eq_conf = File("equilibrate_%s.conf" % charge)
        eq_coord = File("equilibrate_%s.restart.coord" % charge)
        eq_xsc = File("equilibrate_%s.restart.xsc" % charge)
        eq_vel = File("equilibrate_%s.restart.vel" % charge)

        # Production files
        prod_conf = File("production_%s.conf" % charge)
        prod_dcd = File("production_%s.dcd" % charge)

        # Ptraj files
        ptraj_conf = File("ptraj_%s.conf" % charge)
        ptraj_fit = File("ptraj_%s.fit" % charge)
        ptraj_dcd = File("ptraj_%s.dcd" % charge)

        # Sassena incoherent files
        incoherent_conf = File("sassenaInc_%s.xml" % charge)
        fqt_incoherent = File("fqt_inc_%s.hd5" % charge)

        # Sassena coherent files
        coherent_conf = File("sassenaCoh_%s.xml" % charge)
        fqt_coherent = File("fqt_coh_%s.hd5" % charge)

        # Generate psf and configuration files for this charge pipeline
        self.generate_psf(charge)
        self.generate_eq_conf(charge, structure)
        self.generate_prod_conf(charge, structure)
        self.generate_ptraj_conf(charge)
        self.generate_incoherent_conf(charge)
        self.generate_coherent_conf(charge)

        # Equilibrate job
        eqjob = Job("namd", node_label="namd_eq_%s" % charge)
        if self.is_synthetic_workflow:
            eqjob.addArguments("-p", eq_conf)
            eqjob.addArguments("-a", "namd_eq_%s" % charge)
            eqjob.addArguments("-i", eq_conf.name, structure,
                               coordinates.name, parameters.name,
                               extended_system.name)
            task_label = "namd-eq"
            for output_file in ["eq_coord", "eq_xsc", "eq_vel"]:
                eqjob.addArguments(self.keg_params.output_file(task_label, output_file, eval(output_file).name))
            self.keg_params.add_keg_params(eqjob, task_label)
        else:
            eqjob.addArguments(eq_conf)
        eqjob.uses(eq_conf, link=Link.INPUT)
        eqjob.uses(structure, link=Link.INPUT)
        eqjob.uses(coordinates, link=Link.INPUT)
        eqjob.uses(parameters, link=Link.INPUT)
        eqjob.uses(extended_system, link=Link.INPUT)
        eqjob.uses(eq_coord, link=Link.OUTPUT, transfer=False)
        eqjob.uses(eq_xsc, link=Link.OUTPUT, transfer=False)
        eqjob.uses(eq_vel, link=Link.OUTPUT, transfer=False)
        if self.is_synthetic_workflow:
            eqjob.profile("globus", "jobtype", "mpi")
            eqjob.profile("globus", "maxwalltime", "1")
            eqjob.profile("globus", "count", "8")
        else:
            eqjob.profile("globus", "jobtype", "mpi")
            eqjob.profile("globus", "maxwalltime", self.getconf("equilibrate_maxwalltime"))
            eqjob.profile("globus", "count", self.getconf("equilibrate_cores"))
        dax.addJob(eqjob)

        # Production job
        prodjob = Job("namd", node_label="namd_prod_%s" % charge)
        if self.is_synthetic_workflow:
            prodjob.addArguments("-p", prod_conf)
            prodjob.addArguments("-a", "namd_prod_%s" % charge)
            prodjob.addArguments("-i", prod_conf.name, structure,
                                 coordinates.name, parameters.name,
                                 eq_coord.name, eq_xsc.name, eq_vel.name)
            task_label = "namd-prod"
            prodjob.addArguments(self.keg_params.output_file(task_label, "prod_dcd", prod_dcd.name))
            self.keg_params.add_keg_params(prodjob, task_label)
        else:
            prodjob.addArguments(prod_conf)
        prodjob.uses(prod_conf, link=Link.INPUT)
        prodjob.uses(structure, link=Link.INPUT)
        prodjob.uses(coordinates, link=Link.INPUT)
        prodjob.uses(parameters, link=Link.INPUT)
        prodjob.uses(eq_coord, link=Link.INPUT)
        prodjob.uses(eq_xsc, link=Link.INPUT)
        prodjob.uses(eq_vel, link=Link.INPUT)
        prodjob.uses(prod_dcd, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            prodjob.profile("globus", "jobtype", "mpi")
            prodjob.profile("globus", "maxwalltime", "6")
            prodjob.profile("globus", "count", "8")
        else:
            prodjob.profile("globus", "jobtype", "mpi")
            prodjob.profile("globus", "maxwalltime", self.getconf("production_maxwalltime"))
            prodjob.profile("globus", "count", self.getconf("production_cores"))
        dax.addJob(prodjob)
        dax.depends(prodjob, eqjob)

        # ptraj job
        ptrajjob = Job(namespace="amber", name="ptraj",
                       node_label="amber_ptraj_%s" % charge)
        if self.is_synthetic_workflow:
            ptrajjob.addArguments("-p", topfile)
            ptrajjob.addArguments("-a", "amber_ptraj_%s" % charge)
            ptrajjob.addArguments("-i", topfile.name, ptraj_conf.name, prod_dcd.name)
            task_label = "amber-ptraj"
            for output_file in ["ptraj_fit", "ptraj_dcd"]:
                ptrajjob.addArguments(self.keg_params.output_file(task_label, output_file, eval(output_file).name))
            self.keg_params.add_keg_params(ptrajjob, task_label)
        else:
            ptrajjob.addArguments(topfile)
            ptrajjob.setStdin(ptraj_conf)
        ptrajjob.uses(topfile, link=Link.INPUT)
        ptrajjob.uses(ptraj_conf, link=Link.INPUT)
        ptrajjob.uses(prod_dcd, link=Link.INPUT)
        ptrajjob.uses(ptraj_fit, link=Link.OUTPUT, transfer=True)
        ptrajjob.uses(ptraj_dcd, link=Link.OUTPUT, transfer=True)
        ptrajjob.profile("globus", "jobtype", "single")
        ptrajjob.profile("globus", "maxwalltime", self.getconf("ptraj_maxwalltime"))
        ptrajjob.profile("globus", "count", self.getconf("ptraj_cores"))
        dax.addJob(ptrajjob)
        dax.depends(ptrajjob, prodjob)

        # sassena incoherent job
        incojob = Job("sassena", node_label="sassena_inc_%s" % charge)
        if self.is_synthetic_workflow:
            incojob.addArguments("-p", "--config", incoherent_conf)
            incojob.addArguments("-a", "sassena_inc_%s" % charge)
            incojob.addArguments("-i", incoherent_conf.name, ptraj_dcd.name,
                                 incoherent_db.name, coordinates.name)
            task_label = "sassena-inc"
            incojob.addArguments(self.keg_params.output_file(task_label, "fqt_incoherent", fqt_incoherent.name))
            self.keg_params.add_keg_params(incojob, task_label)
        else:
            incojob.addArguments("--config", incoherent_conf)
        incojob.uses(incoherent_conf, link=Link.INPUT)
        incojob.uses(ptraj_dcd, link=Link.INPUT)
        incojob.uses(incoherent_db, link=Link.INPUT)
        incojob.uses(coordinates, link=Link.INPUT)
        incojob.uses(fqt_incoherent, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            incojob.profile("globus", "jobtype", "mpi")
            incojob.profile("globus", "maxwalltime", "6")
            incojob.profile("globus", "count", "8")
        else:
            incojob.profile("globus", "jobtype", "mpi")
            incojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
            incojob.profile("globus", "count", self.getconf("sassena_cores"))
        dax.addJob(incojob)
        dax.depends(incojob, ptrajjob)
        dax.depends(incojob, untarjob)

        # sassena coherent job
        cojob = Job("sassena", node_label="sassena_coh_%s" % charge)
        if self.is_synthetic_workflow:
            cojob.addArguments("-p", "--config", coherent_conf)
            cojob.addArguments("-a", "sassena_coh_%s" % charge)
            cojob.addArguments("-i", coherent_conf.name, ptraj_dcd.name,
                               coherent_db.name, coordinates.name)
            task_label = "sassena-coh"
            cojob.addArguments(self.keg_params.output_file(task_label, "fqt_coherent", fqt_coherent.name))
            self.keg_params.add_keg_params(cojob, task_label)
        else:
            cojob.addArguments("--config", coherent_conf)
        cojob.uses(coherent_conf, link=Link.INPUT)
        cojob.uses(ptraj_dcd, link=Link.INPUT)
        cojob.uses(coherent_db, link=Link.INPUT)
        cojob.uses(coordinates, link=Link.INPUT)
        cojob.uses(fqt_coherent, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            cojob.profile("globus", "jobtype", "mpi")
            cojob.profile("globus", "maxwalltime", "6")
            cojob.profile("globus", "count", "8")
        else:
            cojob.profile("globus", "jobtype", "mpi")
            cojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
            cojob.profile("globus", "count", self.getconf("sassena_cores"))
        dax.addJob(cojob)
        dax.depends(cojob, prodjob)
        dax.depends(cojob, untarjob)

    # Write the DAX file
    dax.writeXMLFile(self.daxfile)