def generate_workflow(self):
    """Generate the MG-RAST production workflow.

    Builds a Pegasus DAX (named ``mgrast-prod-<timestamp>``), wires up the
    QC / preprocess / dereplicate / screen / genecalling / cluster / blat /
    annotate / index / summary job pipeline, writes the DAX to
    ``self.daxfile``, and generates the replica catalog.
    """
    ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    dax = ADAG("mgrast-prod-%s" % ts)

    # These are all the global input files for the workflow
    metagenome = File(self.mgfile)
    self.add_replica(self.mgfile, os.path.abspath(self.mgfile))

    # QC job
    qcJob = Job("wrapper-qc", node_label="wrapper-qc")
    qcJob.addArguments("-input", self.mgfile)
    qcJob.addArguments("-format", self.file_format)
    qcJob.addArguments("-out_prefix", "075")
    qcJob.addArguments("-assembled", self.assembled)
    qcJob.addArguments("-filter_options", self.filter_options)
    qcJob.addArguments("-proc", "8")
    qcJob.uses(metagenome, link=Link.INPUT)
    qcJob.uses("075.assembly.coverage", link=Link.OUTPUT, transfer=False)
    qcJob.uses("075.qc.stats", link=Link.OUTPUT, transfer=False)
    qcJob.uses("075.upload.stats", link=Link.OUTPUT, transfer=False)
    qcJob.profile("globus", "maxwalltime", "60")
    qcJob.profile("globus", "hostcount", "8")
    qcJob.profile("globus", "count", "8")
    dax.addJob(qcJob)

    # Preprocess job
    preprocessJob = Job("wrapper-preprocess", node_label="wrapper-preprocess")
    preprocessJob.addArguments("-input", self.mgfile)
    preprocessJob.addArguments("-format", self.file_format)
    preprocessJob.addArguments("-out_prefix", "100.preprocess")
    preprocessJob.addArguments("-filter_options", self.filter_options)
    preprocessJob.uses(metagenome, link=Link.INPUT)
    preprocessJob.uses("100.preprocess.passed.fna", link=Link.OUTPUT, transfer=False)
    preprocessJob.uses("100.preprocess.removed.fna", link=Link.OUTPUT, transfer=False)
    preprocessJob.profile("globus", "maxwalltime", "20")
    dax.addJob(preprocessJob)

    # Dereplicate job
    dereplicateJob = Job("wrapper-dereplicate", node_label="wrapper-dereplicate")
    dereplicateJob.addArguments("-input=100.preprocess.passed.fna")
    dereplicateJob.addArguments("-out_prefix=150.dereplication")
    dereplicateJob.addArguments("-prefix_length=%s" % self.prefix_length)
    dereplicateJob.addArguments("-dereplicate=%s" % self.dereplicate)
    dereplicateJob.addArguments("-memory=10")
    dereplicateJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
    dereplicateJob.uses("150.dereplication.passed.fna", link=Link.OUTPUT, transfer=False)
    dereplicateJob.uses("150.dereplication.removed.fna", link=Link.OUTPUT, transfer=False)
    dereplicateJob.profile("globus", "maxwalltime", "10")
    dax.addJob(dereplicateJob)
    dax.depends(dereplicateJob, preprocessJob)

    # Bowtie screen job
    bowtieJob = Job("wrapper-bowtie-screen", node_label="wrapper-bowtie-screen")
    bowtieJob.addArguments("-input=150.dereplication.passed.fna")
    bowtieJob.addArguments("-output=299.screen.passed.fna")
    bowtieJob.addArguments("-index=%s" % self.screen_indexes)
    bowtieJob.addArguments("-bowtie=%s" % self.bowtie)
    bowtieJob.addArguments("-proc=8")
    bowtieJob.uses("150.dereplication.passed.fna", link=Link.INPUT)
    bowtieJob.uses("299.screen.passed.fna", link=Link.OUTPUT, transfer=False)
    bowtieJob.profile("globus", "maxwalltime", "30")
    bowtieJob.profile("globus", "hostcount", "8")
    bowtieJob.profile("globus", "count", "8")
    dax.addJob(bowtieJob)
    dax.depends(bowtieJob, dereplicateJob)

    # Genecalling job
    geneJob = Job("wrapper-genecalling", node_label="wrapper-genecalling")
    geneJob.addArguments("-input=299.screen.passed.fna")
    geneJob.addArguments("-out_prefix=350.genecalling.coding")
    geneJob.addArguments("-type=%s" % self.fgs_type)
    geneJob.addArguments("-size=100")
    geneJob.addArguments("-proc=8")
    geneJob.uses("299.screen.passed.fna", link=Link.INPUT)
    geneJob.uses("350.genecalling.coding.faa", link=Link.OUTPUT, transfer=False)
    geneJob.uses("350.genecalling.coding.fna", link=Link.OUTPUT, transfer=False)
    geneJob.profile("globus", "maxwalltime", "30")
    geneJob.profile("globus", "hostcount", "8")
    geneJob.profile("globus", "count", "8")
    dax.addJob(geneJob)
    dax.depends(geneJob, bowtieJob)

    # Cluster (genecalling) job
    cluster1Job = Job("wrapper-cluster", node_label="wrapper-cluster")
    cluster1Job.addArguments("-input=350.genecalling.coding.faa")
    cluster1Job.addArguments("-out_prefix=550.cluster")
    cluster1Job.addArguments("-aa")
    cluster1Job.addArguments("-pid=%s" % self.aa_pid)
    cluster1Job.addArguments("-memory=20")
    cluster1Job.uses("350.genecalling.coding.faa", link=Link.INPUT)
    cluster1Job.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.OUTPUT, transfer=False)
    cluster1Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.OUTPUT, transfer=False)
    cluster1Job.profile("globus", "maxwalltime", "10")
    dax.addJob(cluster1Job)
    dax.depends(cluster1Job, geneJob)

    # Blat_prot job
    blatprotJob = Job("wrapper-blat-prot", node_label="wrapper-blat-prot")
    blatprotJob.addArguments("--input=550.cluster.aa%s.faa" % self.aa_pid)
    blatprotJob.addArguments("--output=650.superblat.sims")
    blatprotJob.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.INPUT)
    blatprotJob.uses("650.superblat.sims", link=Link.OUTPUT, transfer=False)
    blatprotJob.profile("globus", "maxwalltime", "2880")
    blatprotJob.profile("globus", "hostcount", "24")
    blatprotJob.profile("globus", "count", "24")
    dax.addJob(blatprotJob)
    dax.depends(blatprotJob, cluster1Job)

    # Annotate sims (blat prot) job
    annotatesims1Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
    annotatesims1Job.addArguments("-input=650.superblat.sims")
    annotatesims1Job.addArguments("-out_prefix=650")
    annotatesims1Job.addArguments("-aa")
    annotatesims1Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    annotatesims1Job.addArguments("-ann_file=m5nr_v1.bdb")
    annotatesims1Job.uses("650.superblat.sims", link=Link.INPUT)
    annotatesims1Job.uses("650.aa.sims.filter", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.protein", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.lca", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.uses("650.aa.expand.ontology", link=Link.OUTPUT, transfer=False)
    annotatesims1Job.profile("globus", "maxwalltime", "720")
    dax.addJob(annotatesims1Job)
    dax.depends(annotatesims1Job, blatprotJob)

    # Search RNA job
    searchJob = Job("wrapper-search-rna", node_label="wrapper-search-rna")
    searchJob.addArguments("-input=100.preprocess.passed.fna")
    searchJob.addArguments("-output=425.search.rna.fna")
    searchJob.addArguments("-rna_nr=%s" % self.m5rna_clust)
    searchJob.addArguments("-size=100")
    searchJob.addArguments("-proc=8")
    searchJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
    searchJob.uses("425.search.rna.fna", link=Link.OUTPUT, transfer=False)
    searchJob.profile("globus", "maxwalltime", "120")
    searchJob.profile("globus", "hostcount", "8")
    searchJob.profile("globus", "count", "8")
    dax.addJob(searchJob)
    dax.depends(searchJob, preprocessJob)

    # Cluster (search RNA) job
    cluster2Job = Job("wrapper-cluster", node_label="wrapper-cluster")
    cluster2Job.addArguments("-input=425.search.rna.fna")
    cluster2Job.addArguments("-out_prefix=440.cluster")
    cluster2Job.addArguments("-rna")
    cluster2Job.addArguments("-pid=%s" % self.rna_pid)
    cluster2Job.addArguments("-memory=20")
    cluster2Job.uses("425.search.rna.fna", link=Link.INPUT)
    cluster2Job.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.OUTPUT, transfer=False)
    cluster2Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.OUTPUT, transfer=False)
    cluster2Job.profile("globus", "maxwalltime", "30")
    dax.addJob(cluster2Job)
    dax.depends(cluster2Job, searchJob)

    # Blat_rna job
    blatrnaJob = Job("wrapper-blat-rna", node_label="wrapper-blat-rna")
    blatrnaJob.addArguments("--input=440.cluster.rna%s.fna" % self.rna_pid)
    blatrnaJob.addArguments("-rna_nr=m5rna")
    blatrnaJob.addArguments("--output=450.rna.sims")
    blatrnaJob.addArguments("-assembled=%s" % self.assembled)
    blatrnaJob.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.INPUT)
    blatrnaJob.uses("450.rna.sims", link=Link.OUTPUT, transfer=False)
    blatrnaJob.profile("globus", "maxwalltime", "20")
    dax.addJob(blatrnaJob)
    dax.depends(blatrnaJob, cluster2Job)

    # Annotate sims (blat RNA) job
    annotatesims2Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
    annotatesims2Job.addArguments("-input=450.rna.sims")
    annotatesims2Job.addArguments("-out_prefix=450")
    annotatesims2Job.addArguments("-rna")
    annotatesims2Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    annotatesims2Job.addArguments("-ann_file=m5nr_v1.bdb")
    annotatesims2Job.uses("450.rna.sims", link=Link.INPUT)
    annotatesims2Job.uses("450.rna.sims.filter", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.uses("450.rna.expand.rna", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.uses("450.rna.expand.lca", link=Link.OUTPUT, transfer=False)
    annotatesims2Job.profile("globus", "maxwalltime", "30")
    dax.addJob(annotatesims2Job)
    dax.depends(annotatesims2Job, blatrnaJob)

    # Index sim seq job
    indexJob = Job("wrapper-index", node_label="wrapper-index")
    indexJob.addArguments("-in_seqs=350.genecalling.coding.fna")
    indexJob.addArguments("-in_seqs=425.search.rna.fna")
    indexJob.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    indexJob.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    indexJob.addArguments("-in_sims=650.aa.sims.filter")
    indexJob.addArguments("-in_sims=450.rna.sims.filter")
    indexJob.addArguments("-output=700.annotation.sims.filter.seq")
    indexJob.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
    indexJob.addArguments("-memory=10")
    indexJob.addArguments("-ann_file=m5nr_v1.bdb")
    indexJob.uses("350.genecalling.coding.fna", link=Link.INPUT)
    indexJob.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    indexJob.uses("650.aa.sims.filter", link=Link.INPUT)
    indexJob.uses("425.search.rna.fna", link=Link.INPUT)
    indexJob.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    indexJob.uses("450.rna.sims.filter", link=Link.INPUT)
    indexJob.uses("700.annotation.sims.filter.seq", link=Link.OUTPUT, transfer=False)
    indexJob.uses("700.annotation.sims.filter.seq.index", link=Link.OUTPUT, transfer=False)
    indexJob.profile("globus", "maxwalltime", "120")
    dax.addJob(indexJob)
    dax.depends(indexJob, geneJob)
    dax.depends(indexJob, cluster1Job)
    dax.depends(indexJob, cluster2Job)
    dax.depends(indexJob, searchJob)
    dax.depends(indexJob, annotatesims1Job)
    # BUG FIX: indexJob reads 450.rna.sims.filter, which is produced by
    # annotatesims2Job, so it must also wait on that job. This edge was missing.
    dax.depends(indexJob, annotatesims2Job)

    # Annotate summary job (13): md5 summary
    summary13Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary13Job.addArguments("-job=1")
    summary13Job.addArguments("-in_expand=650.aa.expand.protein")
    summary13Job.addArguments("-in_expand=450.rna.expand.rna")
    summary13Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary13Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary13Job.addArguments("-in_assemb=075.assembly.coverage")
    summary13Job.addArguments("-in_index=700.annotation.sims.filter.seq.index")
    summary13Job.addArguments("-output=700.annotation.md5.summary")
    summary13Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary13Job.addArguments("-type=md5")
    summary13Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary13Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary13Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary13Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary13Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary13Job.uses("700.annotation.sims.filter.seq.index", link=Link.INPUT)
    summary13Job.uses("700.annotation.md5.summary", link=Link.OUTPUT, transfer=True)
    summary13Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary13Job)
    dax.depends(summary13Job, qcJob)
    dax.depends(summary13Job, cluster1Job)
    dax.depends(summary13Job, cluster2Job)
    dax.depends(summary13Job, indexJob)
    dax.depends(summary13Job, annotatesims1Job)
    dax.depends(summary13Job, annotatesims2Job)

    # Annotate summary job (14): function summary
    summary14Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary14Job.addArguments("-job=1")
    summary14Job.addArguments("-in_expand=650.aa.expand.protein")
    summary14Job.addArguments("-in_expand=450.rna.expand.rna")
    summary14Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary14Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary14Job.addArguments("-in_assemb=075.assembly.coverage")
    summary14Job.addArguments("-output=700.annotation.function.summary")
    summary14Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary14Job.addArguments("-type=function")
    summary14Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary14Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary14Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary14Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary14Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary14Job.uses("700.annotation.function.summary", link=Link.OUTPUT, transfer=True)
    summary14Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary14Job)
    dax.depends(summary14Job, qcJob)
    dax.depends(summary14Job, cluster1Job)
    dax.depends(summary14Job, cluster2Job)
    dax.depends(summary14Job, annotatesims1Job)
    dax.depends(summary14Job, annotatesims2Job)

    # Annotate summary job (15): organism summary
    summary15Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary15Job.addArguments("-job=1")
    summary15Job.addArguments("-in_expand=650.aa.expand.protein")
    summary15Job.addArguments("-in_expand=450.rna.expand.rna")
    summary15Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary15Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary15Job.addArguments("-in_assemb=075.assembly.coverage")
    summary15Job.addArguments("-output=700.annotation.organism.summary")
    summary15Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary15Job.addArguments("-type=organism")
    summary15Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary15Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary15Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary15Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary15Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary15Job.uses("700.annotation.organism.summary", link=Link.OUTPUT, transfer=True)
    summary15Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary15Job)
    dax.depends(summary15Job, qcJob)
    dax.depends(summary15Job, cluster1Job)
    dax.depends(summary15Job, cluster2Job)
    dax.depends(summary15Job, annotatesims1Job)
    dax.depends(summary15Job, annotatesims2Job)

    # Annotate summary job (16): lca summary
    summary16Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary16Job.addArguments("-job=1")
    summary16Job.addArguments("-in_expand=650.aa.expand.lca")
    summary16Job.addArguments("-in_expand=450.rna.expand.lca")
    summary16Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary16Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary16Job.addArguments("-in_assemb=075.assembly.coverage")
    summary16Job.addArguments("-output=700.annotation.lca.summary")
    summary16Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary16Job.addArguments("-type=lca")
    summary16Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary16Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary16Job.uses("650.aa.expand.lca", link=Link.INPUT)
    summary16Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary16Job.uses("450.rna.expand.lca", link=Link.INPUT)
    summary16Job.uses("700.annotation.lca.summary", link=Link.OUTPUT, transfer=True)
    summary16Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary16Job)
    dax.depends(summary16Job, qcJob)
    dax.depends(summary16Job, cluster1Job)
    dax.depends(summary16Job, cluster2Job)
    dax.depends(summary16Job, annotatesims1Job)
    dax.depends(summary16Job, annotatesims2Job)

    # Annotate summary job (17): ontology summary (protein pipeline only)
    summary17Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary17Job.addArguments("-job=1")
    summary17Job.addArguments("-in_expand=650.aa.expand.ontology")
    summary17Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary17Job.addArguments("-in_assemb=075.assembly.coverage")
    summary17Job.addArguments("-output=700.annotation.ontology.summary")
    summary17Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary17Job.addArguments("-type=ontology")
    summary17Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary17Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary17Job.uses("650.aa.expand.ontology", link=Link.INPUT)
    summary17Job.uses("700.annotation.ontology.summary", link=Link.OUTPUT, transfer=True)
    summary17Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary17Job)
    dax.depends(summary17Job, qcJob)
    dax.depends(summary17Job, cluster1Job)
    dax.depends(summary17Job, annotatesims1Job)

    # Annotate summary job (18): source stats
    summary18Job = Job("wrapper-summary", node_label="wrapper-summary")
    summary18Job.addArguments("-job=1")
    summary18Job.addArguments("-in_expand=650.aa.expand.protein")
    summary18Job.addArguments("-in_expand=450.rna.expand.rna")
    summary18Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
    summary18Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
    summary18Job.addArguments("-in_assemb=075.assembly.coverage")
    summary18Job.addArguments("-output=700.annotation.source.stats")
    summary18Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
    summary18Job.addArguments("-type=source")
    summary18Job.uses("075.assembly.coverage", link=Link.INPUT)
    summary18Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
    summary18Job.uses("650.aa.expand.protein", link=Link.INPUT)
    summary18Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
    summary18Job.uses("450.rna.expand.rna", link=Link.INPUT)
    summary18Job.uses("700.annotation.source.stats", link=Link.OUTPUT, transfer=True)
    summary18Job.profile("globus", "maxwalltime", "30")
    dax.addJob(summary18Job)
    dax.depends(summary18Job, qcJob)
    dax.depends(summary18Job, cluster1Job)
    dax.depends(summary18Job, cluster2Job)
    dax.depends(summary18Job, annotatesims1Job)
    dax.depends(summary18Job, annotatesims2Job)

    # Write the DAX file
    dax.writeXMLFile(self.daxfile)

    # Generate the replica catalog
    self.generate_replica_catalog()
def run_python_on_parameters(
    self,
    job_name: Locator,
    python_module: Any,
    parameters: Union[Parameters, Dict[str, Any]],
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
    category: Optional[str] = None,
) -> DependencyNode:
    """
    Schedule a job to run the given *python_module* on the given *parameters*.

    If this job requires other jobs to be executed first,
    include them in *depends_on*.

    This method returns a `DependencyNode` which can be used in *depends_on*
    for future jobs.
    """
    job_dir = self.directory_for(job_name)
    # The checkpoint file doubles as the job's (only) declared output; its
    # existence marks the job as already complete.
    ckpt_name = job_name / "___ckpt"
    checkpoint_path = job_dir / "___ckpt"
    depends_on = _canonicalize_depends_on(depends_on)
    # *python_module* may be given either as an already-qualified dotted name
    # or as a module/callable object whose qualified name we derive.
    if isinstance(python_module, str):
        fully_qualified_module_name = python_module
    else:
        fully_qualified_module_name = fully_qualified_name(python_module)
    # allow users to specify the parameters as a dict for convenience
    if not isinstance(parameters, Parameters):
        parameters = Parameters.from_mapping(parameters)
    # If we've already scheduled this identical job,
    # then don't schedule it again.
    # A job's identity is (module name, serialized YAML of its parameters).
    params_sink = CharSink.to_string()
    YAMLParametersWriter().write(parameters, params_sink)
    signature = (fully_qualified_module_name, params_sink.last_string_written)
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]
    script_path = job_dir / "___run.sh"
    # Stdout goes to the job dir unless the parameters override it via "logfile".
    stdout_path = parameters.string(
        "logfile", default=str((job_dir / "___stdout.log").absolute()))
    # Materialize a shell script that activates the conda environment and
    # runs the module; this script is what Pegasus actually executes.
    self._conda_script_generator.write_shell_script_to(
        entry_point_name=fully_qualified_module_name,
        parameters=parameters,
        working_directory=job_dir,
        script_path=script_path,
        params_path=job_dir / "____params.params",
        stdout_file=stdout_path,
        ckpt_path=checkpoint_path,
        override_conda_config=override_conda_config,
    )
    # Register the generated script as a Pegasus executable
    # (slashes are not valid in executable names, hence the replace).
    script_executable = Executable(
        namespace=self._namespace,
        name=str(job_name).replace("/", "_"),
        version="4.0",
        os="linux",
        arch="x86_64",
    )
    script_executable.addPFN(
        path_to_pfn(script_path, site=self._default_site))
    if not self._job_graph.hasExecutable(script_executable):
        self._job_graph.addExecutable(script_executable)
    job = Job(script_executable)
    self._job_graph.addJob(job)
    # Wire up dependency edges and consume each parent's output files as inputs.
    for parent_dependency in depends_on:
        if parent_dependency.job:
            self._job_graph.depends(job, parent_dependency.job)
        for out_file in parent_dependency.output_files:
            job.uses(out_file, link=Link.INPUT)
    # A per-job resource request is unified with the workflow default;
    # otherwise the default applies as-is.
    if resource_request is not None:
        resource_request = self.default_resource_request.unify(
            resource_request)
    else:
        resource_request = self.default_resource_request
    if category:
        job.profile(Namespace.DAGMAN, "category", category)
    resource_request.apply_to_job(job, job_name=self._job_name_for(job_name))
    # Handle Output Files
    # This is currently only handled as the checkpoint file
    # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
    checkpoint_pegasus_file = path_to_pegasus_file(checkpoint_path,
                                                   site=self._default_site,
                                                   name=f"{ckpt_name}")
    if checkpoint_pegasus_file not in self._added_files:
        self._job_graph.addFile(checkpoint_pegasus_file)
        self._added_files.add(checkpoint_pegasus_file)
    # If the checkpoint file already exists, we want to add it to the replica catalog
    # so that we don't run the job corresponding to the checkpoint file again
    if checkpoint_path.exists():
        with self._replica_catalog.open("a+") as handle:
            handle.write(
                f"{ckpt_name} file://{checkpoint_path} site={self._default_site}\n"
            )
    job.uses(checkpoint_pegasus_file, link=Link.OUTPUT, transfer=True)
    dependency_node = DependencyNode.from_job(
        job, output_files=[checkpoint_pegasus_file])
    # Cache under the signature so an identical future request is deduplicated.
    self._signature_to_job[signature] = dependency_node
    logging.info("Scheduled Python job %s", job_name)
    return dependency_node
def generate_dax(self):
    """Generate the refinement workflow (DAX, config files, and replica catalog).

    For each charge in ``self.charges`` this builds a NAMD equilibrate ->
    NAMD production -> amber ptraj -> sassena (incoherent + coherent)
    pipeline, plus one shared job that untars the sassena database.
    When ``self.is_synthetic_workflow`` is set, jobs are emitted in the
    synthetic "keg" form instead of invoking the real tools.
    """
    ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
    dax = ADAG("refinement-%s" % ts)

    # These are all the global input files for the workflow
    coordinates = File(self.coordinates)
    parameters = File(self.parameters)
    extended_system = File(self.extended_system)
    topfile = File(self.topfile)
    sassena_db = File(self.sassena_db)
    incoherent_db = File(self.incoherent_db)
    coherent_db = File(self.coherent_db)

    # This job untars the sassena db and makes it available to the other
    # jobs in the workflow
    untarjob = Job("tar", node_label="untar")
    if self.is_synthetic_workflow:
        untarjob.addArguments("-p", "-xzvf", sassena_db.name)
        untarjob.addArguments("-a", "tar")
        # FIX: explicit (label, File) pairs replace the original
        # eval(output_file) lookup of local variables by string name.
        for label, db_file in [("incoherent_db", incoherent_db),
                               ("coherent_db", coherent_db)]:
            untarjob.addArguments(self.keg_params.output_file("tar", label, db_file.name))
        self.keg_params.add_keg_params(untarjob)
    else:
        untarjob.addArguments("-xzvf", sassena_db)
    untarjob.uses(sassena_db, link=Link.INPUT)
    untarjob.uses(incoherent_db, link=Link.OUTPUT, transfer=False)
    untarjob.uses(coherent_db, link=Link.OUTPUT, transfer=False)
    untarjob.profile("globus", "jobtype", "single")
    untarjob.profile("globus", "maxwalltime", "1")
    untarjob.profile("globus", "count", "1")
    dax.addJob(untarjob)

    # For each charge that was listed in the config file
    for charge in self.charges:
        structure = "Q%s.psf" % charge

        # Equilibrate files
        eq_conf = File("equilibrate_%s.conf" % charge)
        eq_coord = File("equilibrate_%s.restart.coord" % charge)
        eq_xsc = File("equilibrate_%s.restart.xsc" % charge)
        eq_vel = File("equilibrate_%s.restart.vel" % charge)

        # Production files
        prod_conf = File("production_%s.conf" % charge)
        prod_dcd = File("production_%s.dcd" % charge)

        # Ptraj files
        ptraj_conf = File("ptraj_%s.conf" % charge)
        ptraj_fit = File("ptraj_%s.fit" % charge)
        ptraj_dcd = File("ptraj_%s.dcd" % charge)

        # Sassena incoherent files
        incoherent_conf = File("sassenaInc_%s.xml" % charge)
        fqt_incoherent = File("fqt_inc_%s.hd5" % charge)

        # Sassena coherent files
        coherent_conf = File("sassenaCoh_%s.xml" % charge)
        fqt_coherent = File("fqt_coh_%s.hd5" % charge)

        # Generate psf and configuration files for this charge pipeline
        self.generate_psf(charge)
        self.generate_eq_conf(charge, structure)
        self.generate_prod_conf(charge, structure)
        self.generate_ptraj_conf(charge)
        self.generate_incoherent_conf(charge)
        self.generate_coherent_conf(charge)

        # Equilibrate job
        eqjob = Job("namd", node_label="namd_eq_%s" % charge)
        if self.is_synthetic_workflow:
            eqjob.addArguments("-p", eq_conf)
            eqjob.addArguments("-a", "namd_eq_%s" % charge)
            eqjob.addArguments("-i", eq_conf.name, structure, coordinates.name,
                               parameters.name, extended_system.name)
            task_label = "namd-eq"
            # FIX: explicit pairs instead of eval() by variable name.
            for label, out_file in [("eq_coord", eq_coord),
                                    ("eq_xsc", eq_xsc),
                                    ("eq_vel", eq_vel)]:
                eqjob.addArguments(self.keg_params.output_file(task_label, label, out_file.name))
            self.keg_params.add_keg_params(eqjob, task_label)
        else:
            eqjob.addArguments(eq_conf)
        eqjob.uses(eq_conf, link=Link.INPUT)
        eqjob.uses(structure, link=Link.INPUT)
        eqjob.uses(coordinates, link=Link.INPUT)
        eqjob.uses(parameters, link=Link.INPUT)
        eqjob.uses(extended_system, link=Link.INPUT)
        eqjob.uses(eq_coord, link=Link.OUTPUT, transfer=False)
        eqjob.uses(eq_xsc, link=Link.OUTPUT, transfer=False)
        eqjob.uses(eq_vel, link=Link.OUTPUT, transfer=False)
        if self.is_synthetic_workflow:
            eqjob.profile("globus", "jobtype", "mpi")
            eqjob.profile("globus", "maxwalltime", "1")
            eqjob.profile("globus", "count", "8")
        else:
            eqjob.profile("globus", "jobtype", "mpi")
            eqjob.profile("globus", "maxwalltime", self.getconf("equilibrate_maxwalltime"))
            eqjob.profile("globus", "count", self.getconf("equilibrate_cores"))
        dax.addJob(eqjob)

        # Production job
        prodjob = Job("namd", node_label="namd_prod_%s" % charge)
        if self.is_synthetic_workflow:
            prodjob.addArguments("-p", prod_conf)
            prodjob.addArguments("-a", "namd_prod_%s" % charge)
            prodjob.addArguments("-i", prod_conf.name, structure, coordinates.name,
                                 parameters.name, eq_coord.name, eq_xsc.name, eq_vel.name)
            task_label = "namd-prod"
            prodjob.addArguments(self.keg_params.output_file(task_label, "prod_dcd", prod_dcd.name))
            self.keg_params.add_keg_params(prodjob, task_label)
        else:
            prodjob.addArguments(prod_conf)
        prodjob.uses(prod_conf, link=Link.INPUT)
        prodjob.uses(structure, link=Link.INPUT)
        prodjob.uses(coordinates, link=Link.INPUT)
        prodjob.uses(parameters, link=Link.INPUT)
        prodjob.uses(eq_coord, link=Link.INPUT)
        prodjob.uses(eq_xsc, link=Link.INPUT)
        prodjob.uses(eq_vel, link=Link.INPUT)
        prodjob.uses(prod_dcd, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            prodjob.profile("globus", "jobtype", "mpi")
            prodjob.profile("globus", "maxwalltime", "6")
            prodjob.profile("globus", "count", "8")
        else:
            prodjob.profile("globus", "jobtype", "mpi")
            prodjob.profile("globus", "maxwalltime", self.getconf("production_maxwalltime"))
            prodjob.profile("globus", "count", self.getconf("production_cores"))
        dax.addJob(prodjob)
        dax.depends(prodjob, eqjob)

        # ptraj job
        ptrajjob = Job(namespace="amber", name="ptraj",
                       node_label="amber_ptraj_%s" % charge)
        if self.is_synthetic_workflow:
            ptrajjob.addArguments("-p", topfile)
            ptrajjob.addArguments("-a", "amber_ptraj_%s" % charge)
            ptrajjob.addArguments("-i", topfile.name, ptraj_conf.name, prod_dcd.name)
            task_label = "amber-ptraj"
            # FIX: explicit pairs instead of eval() by variable name.
            for label, out_file in [("ptraj_fit", ptraj_fit),
                                    ("ptraj_dcd", ptraj_dcd)]:
                ptrajjob.addArguments(self.keg_params.output_file(task_label, label, out_file.name))
            self.keg_params.add_keg_params(ptrajjob, task_label)
        else:
            ptrajjob.addArguments(topfile)
            ptrajjob.setStdin(ptraj_conf)
        ptrajjob.uses(topfile, link=Link.INPUT)
        ptrajjob.uses(ptraj_conf, link=Link.INPUT)
        ptrajjob.uses(prod_dcd, link=Link.INPUT)
        ptrajjob.uses(ptraj_fit, link=Link.OUTPUT, transfer=True)
        ptrajjob.uses(ptraj_dcd, link=Link.OUTPUT, transfer=True)
        ptrajjob.profile("globus", "jobtype", "single")
        ptrajjob.profile("globus", "maxwalltime", self.getconf("ptraj_maxwalltime"))
        ptrajjob.profile("globus", "count", self.getconf("ptraj_cores"))
        dax.addJob(ptrajjob)
        dax.depends(ptrajjob, prodjob)

        # sassena incoherent job
        incojob = Job("sassena", node_label="sassena_inc_%s" % charge)
        if self.is_synthetic_workflow:
            incojob.addArguments("-p", "--config", incoherent_conf)
            incojob.addArguments("-a", "sassena_inc_%s" % charge)
            incojob.addArguments("-i", incoherent_conf.name, ptraj_dcd.name,
                                 incoherent_db.name, coordinates.name)
            task_label = "sassena-inc"
            incojob.addArguments(self.keg_params.output_file(task_label, "fqt_incoherent", fqt_incoherent.name))
            self.keg_params.add_keg_params(incojob, task_label)
        else:
            incojob.addArguments("--config", incoherent_conf)
        incojob.uses(incoherent_conf, link=Link.INPUT)
        incojob.uses(ptraj_dcd, link=Link.INPUT)
        incojob.uses(incoherent_db, link=Link.INPUT)
        incojob.uses(coordinates, link=Link.INPUT)
        incojob.uses(fqt_incoherent, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            incojob.profile("globus", "jobtype", "mpi")
            incojob.profile("globus", "maxwalltime", "6")
            incojob.profile("globus", "count", "8")
        else:
            incojob.profile("globus", "jobtype", "mpi")
            incojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
            incojob.profile("globus", "count", self.getconf("sassena_cores"))
        dax.addJob(incojob)
        dax.depends(incojob, ptrajjob)
        dax.depends(incojob, untarjob)

        # sassena coherent job
        cojob = Job("sassena", node_label="sassena_coh_%s" % charge)
        if self.is_synthetic_workflow:
            cojob.addArguments("-p", "--config", coherent_conf)
            cojob.addArguments("-a", "sassena_coh_%s" % charge)
            cojob.addArguments("-i", coherent_conf.name, ptraj_dcd.name,
                               coherent_db.name, coordinates.name)
            task_label = "sassena-coh"
            cojob.addArguments(self.keg_params.output_file(task_label, "fqt_coherent", fqt_coherent.name))
            self.keg_params.add_keg_params(cojob, task_label)
        else:
            cojob.addArguments("--config", coherent_conf)
        cojob.uses(coherent_conf, link=Link.INPUT)
        cojob.uses(ptraj_dcd, link=Link.INPUT)
        cojob.uses(coherent_db, link=Link.INPUT)
        cojob.uses(coordinates, link=Link.INPUT)
        cojob.uses(fqt_coherent, link=Link.OUTPUT, transfer=True)
        if self.is_synthetic_workflow:
            cojob.profile("globus", "jobtype", "mpi")
            cojob.profile("globus", "maxwalltime", "6")
            cojob.profile("globus", "count", "8")
        else:
            cojob.profile("globus", "jobtype", "mpi")
            cojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
            cojob.profile("globus", "count", self.getconf("sassena_cores"))
        dax.addJob(cojob)
        dax.depends(cojob, prodjob)
        # BUG FIX: cojob reads ptraj_dcd, which is produced by ptrajjob, so it
        # must also depend on ptrajjob (mirrors incojob). This edge was missing.
        dax.depends(cojob, ptrajjob)
        dax.depends(cojob, untarjob)

    # Write the DAX file
    dax.writeXMLFile(self.daxfile)