class PipelineCommandWrapper(PipelineCommand):
    """
    Class for constructing program command lines

    This class is based on the PipelineCommand class but
    can be used directly (rather than needing to be
    subclassed).

    For example, to wrap the 'ls' command directly:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls',dirn)

    It is also possible to extend the command line using
    the 'add_args' method, for example:

    >>> ls_command = PipelineCommandWrapper("List directory",'ls')
    >>> ls_command.add_args(dirn)
    """
    def __init__(self, name, *args):
        """
        Create a new PipelineCommandWrapper instance

        Arguments:
          name (str): arbitrary name for the command
          args (List): initial list of arguments making
            up the command
        """
        PipelineCommand.__init__(self, *args)
        self._name = str(name)
        self._cmd = None
        if args:
            self._cmd = Command(*args)

    def add_args(self, *args):
        """
        Add additional arguments to extend the command being built

        Arguments:
          args (List): one or more arguments to append to
            the command
        """
        if self._cmd is None:
            self._cmd = Command(*args)
        else:
            self._cmd.add_args(*args)

    def init(self, *args):
        """
        Internal: dummy init which does nothing
        """
        pass

    def cmd(self):
        """
        Internal: implement the 'cmd' method
        """
        return self._cmd
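# Illustrative sketch (not part of the original module): building a
# command line in stages with PipelineCommandWrapper. The 'grep'
# command and the placeholder pattern/file arguments are assumptions
# chosen for illustration; 'cmd()' returns the assembled Command.
grep_cmd = PipelineCommandWrapper("Search file", 'grep', '-n')
grep_cmd.add_args('some_pattern')
grep_cmd.add_args('some_file.txt')
print grep_cmd.cmd()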
def info_func(p):
    # Locate the program executable and extract its version by
    # running it with no arguments and parsing the first output
    # line that starts with the program name
    name = os.path.basename(p)
    exe = find_program(p)
    version = ''
    output = Command(exe).subprocess_check_output()[1]
    for line in output.split('\n'):
        if line.startswith(name):
            try:
                version = line.split()[1]
            except IndexError:
                pass
            break
    return (exe, name.upper(), version)
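# Hedged usage sketch for info_func: assumes a program which reports
# its version on an output line starting with its own name; the
# 'fastq_screen' path below is a placeholder, not from the source.
exe, name, version = info_func('/usr/local/bin/fastq_screen')
print "%s: %s (version '%s')" % (name, exe, version)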
def fastq_screen_tag(conf_file, fastq_in, out_dir,
                     aligner=None, threads=1, tempdir=None):
    """
    Run 'fastq_screen' and output tagged fastq file

    Raises an Exception in the event of an error.

    Arguments:
      conf_file (str): path to the fastq_screen .conf file
      fastq_in (str): path to the FASTQ file to screen
      out_dir (str): path to the output directory to put
        the tagged FASTQ in
      aligner (str): optional, name of the aligner to pass
        to fastq_screen (default: don't specify the aligner)
      threads (int): optional, the number of threads to use
        when running fastq_screen (default: 1)
      tempdir (str): optional, directory to create temporary
        working directories in when running fastq_screen

    Returns:
      String: path to the tagged output FASTQ file
    """
    # Make a temporary working directory
    work_dir = tempfile.mkdtemp(suffix='.fastq_screen',
                                dir=tempdir)
    # Build fastq_screen command
    fastq_screen_cmd = Command('fastq_screen',
                               '--subset', 0,
                               '--threads', threads,
                               '--conf', conf_file,
                               '--tag',
                               '--outdir', work_dir)
    if aligner is not None:
        fastq_screen_cmd.add_args('--aligner', aligner)
    fastq_screen_cmd.add_args(fastq_in)
    print "Running %s" % fastq_screen_cmd
    # Run the command
    exit_code = fastq_screen_cmd.run_subprocess(working_dir=work_dir)
    if exit_code != 0:
        err_msg = "Screening %s against %s failed (exit code %d)" % \
                  (fastq_in, conf_file, exit_code)
    else:
        # Handle the outputs
        tagged_fastq = os.path.basename(strip_ext(fastq_in, '.fastq')) \
                       + '.tagged.fastq'
        if not os.path.exists(os.path.join(work_dir, tagged_fastq)):
            err_msg = "Failed to generate tagged fastq file %s" % \
                      tagged_fastq
            exit_code = 1
        else:
            os.rename(os.path.join(work_dir, tagged_fastq),
                      os.path.join(out_dir, tagged_fastq))
    # Clean up working directory
    shutil.rmtree(work_dir)
    # Raise exception if there was a problem
    if exit_code != 0:
        raise Exception(err_msg)
    # Return path to tagged file
    return os.path.join(out_dir, tagged_fastq)
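# Hedged invocation sketch for fastq_screen_tag; all of the paths
# and the aligner choice are placeholder assumptions, not values
# taken from the source.
tagged_fq = fastq_screen_tag('/data/conf/fastq_screen.conf',
                             '/data/fastqs/sample1_R1.fastq',
                             '/data/qc',
                             aligner='bowtie2',
                             threads=4)
print "Tagged FASTQ: %s" % tagged_fq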
def batch_fastqs(fastqs, batch_size, basename="batched",
                 out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files and then splits reads into
    smaller Fastqs using the external 'batch' utility.

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      batch_size (int): number of reads to allocate to
        each batch
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs will be written

    Returns:
      List: paths to the batched Fastq files
    """
    # Determine number of batches
    nreads = get_read_count(fastqs)
    nbatches = nreads/batch_size
    if nbatches*batch_size < nreads:
        nbatches += 1
    print "Creating %d batches of %d reads" % (nbatches,
                                               batch_size)
    assert(batch_size*nbatches >= nreads)
    # Check if fastqs are compressed
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')
    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number
    # Build and run the batching command
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l', batch_size*4,
                       '-d',
                       '-a', 3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir, "%s.B" % basename))
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)
    # Check for successful exit code
    retcode = Command("/bin/bash", batch_script).run_subprocess(
        working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    print "Batching completed"
    # Collect and return the batched Fastq names
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s" % (basename, i, suffix))
                      for i in xrange(0, nbatches)]
    return batched_fastqs
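# Hedged usage sketch for batch_fastqs; the input paths, batch size
# and output directory are placeholder assumptions.
batched = batch_fastqs(['/data/run1_R1.fastq.gz',
                        '/data/run2_R1.fastq.gz'],
                       batch_size=100000,
                       basename='run_R1',
                       out_dir='/data/batches')
for fq in batched:
    print "Batch: %s" % fq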
print "QC report: %s" % out_file # Run the QC announce("Running QC") max_jobs = __settings.general.max_concurrent_jobs sched = SimpleScheduler(runner=qc_runner, max_concurrent=max_jobs) sched.start() for sample in samples: print "Checking/setting up for sample '%s'" % sample.name for fq in sample.fastq: if sample.verify_qc(qc_dir,fq): print "-- %s: QC ok" % fq else: print "-- %s: setting up QC" % fq qc_cmd = Command('illumina_qc.sh',fq) if args.nthreads > 1: qc_cmd.add_args('--threads',args.nthreads) qc_cmd.add_args('--subset',args.fastq_screen_subset, '--qc_dir',qc_dir) job = sched.submit(qc_cmd, wd=project.dirn, name="%s.%s" % (qc_base, os.path.basename(fq)), log_dir=log_dir) print "Job: %s" % job # Wait for the scheduler to run all jobs sched.wait() sched.stop() # Verify the QC
    def cmd(self):
        return Command("echo", self._txt)
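# The 'cmd' method above reads like part of a PipelineCommand
# subclass; a minimal sketch of the full pattern it implies (the
# 'EchoText' name and the 'init' signature are inferred from the
# fragment, not confirmed by the source):
class EchoText(PipelineCommand):
    def init(self, txt):
        # Store the text to be echoed
        self._txt = txt
    def cmd(self):
        # Build the command line to execute
        return Command("echo", self._txt)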
class PipelineTask(object):
    """
    Base class defining a 'task' to run as part of a pipeline

    A 'task' wraps one or more external programs which can
    be run concurrently, and which produces a set of outputs.
    Individual programs should be wrapped in instances of the
    'PipelineCommand' class.

    This class should be subclassed to implement the 'init',
    'setup', 'finish' (optionally) and 'output' methods.

    The 'add_cmd' method can be used within 'setup' to add
    one or more 'PipelineCommand' instances.
    """
    def __init__(self, _name, *args, **kws):
        """
        Create a new PipelineTask instance

        Arguments:
          _name (str): an arbitrary user-friendly name for
            the task instance
          args (List): list of arguments to be supplied to
            the subclass (must match those defined in the
            'init' method)
          kws (Dictionary): dictionary of keyword-value pairs
            to be supplied to the subclass (must match those
            defined in the 'init' method)
        """
        self._name = str(_name)
        self._args = args
        self._kws = kws
        self._commands = []
        self._task_name = "%s.%s" % (sanitize_name(self._name),
                                     uuid.uuid4())
        self._completed = False
        self._stdout_files = []
        self._exit_code = 0
        # Working directory
        self._working_dir = None
        # Running jobs
        self._jobs = []
        self._groups = []
        # Deal with subclass arguments
        try:
            self._callargs = inspect.getcallargs(self.init, *args, **kws)
        except Exception as ex:
            logger.error("Exception setting up args for task '%s' (%s): %s"
                         % (self._name, self.__class__, ex))
            raise ex
        try:
            del(self._callargs['self'])
        except KeyError:
            pass
        # Execute the init method
        self.invoke(self.init, self._args, self._kws)

    @property
    def args(self):
        """
        Fetch parameters supplied to the instance
        """
        return AttributeDictionary(**self._callargs)

    @property
    def completed(self):
        """
        Check if the task has completed
        """
        return self._completed

    @property
    def exit_code(self):
        """
        Get the exit code for completed task

        Returns:
          Integer: exit code, or 'None' if task hasn't completed
        """
        if not self.completed:
            return None
        else:
            return self._exit_code

    @property
    def stdout(self):
        """
        Get the standard output from the task

        Returns:
          String: standard output from the task.
        """
        stdout = []
        for f in self._stdout_files:
            with open(f, 'r') as fp:
                stdout.append(fp.read())
        return ''.join(stdout)

    def name(self):
        """
        Get the name of the task within the pipeline

        Returns:
          String: a name consisting of a 'sanitized' version
            of the supplied name appended with a unique id
            code
        """
        return self._task_name

    def fail(self, exit_code=1, message=None):
        """
        Register the task as failing

        Intended to be invoked from the subclassed 'setup'
        or 'finish' methods, to terminate the task and
        indicate that it has failed.

        Arguments:
          exit_code (int): optional, specifies the exit code
            to return (defaults to 1)
          message (str): optional, error message to report to
            the pipeline user
        """
        if message:
            self.report("failed: %s" % message)
        self.report("failed: exit code set to %s" % exit_code)
        self._completed = True
        self._exit_code = exit_code

    def report(self, s):
        """
        Internal: report messages from the task
        """
        print "%s [Task: %s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S"),
                                    self._name, s)

    def invoke(self, f, args=None, kws=None):
        """
        Internal: invoke arbitrary method on the task

        Arguments:
          f (function): method to invoke (e.g. 'self.init')
          args (list): arguments to invoke function with
          kws (dictionary): keyworded parameters to invoke
            function with
        """
        # Switch to working directory, if defined
        if self._working_dir is not None:
            current_dir = os.getcwd()
            os.chdir(self._working_dir)
        # Invoke the requested method
        try:
            with Capturing() as output:
                if args is None:
                    f()
                else:
                    f(*args, **kws)
            self.report("done '%s'" % f.__name__)
            for line in output:
                self.report("%s STDOUT: %s" % (f.__name__, line))
        except NotImplementedError:
            pass
        except Exception as ex:
            self.report("exception invoking '%s': %s" % (f.__name__, ex))
            traceback.print_exc()
            self._exit_code += 1
        # Switch back to original directory
        if self._working_dir is not None:
            os.chdir(current_dir)

    def task_completed(self, name, jobs, sched):
        """
        Internal: callback method

        This is a callback method which is invoked when
        scheduled jobs in the task finish

        Arguments:
          name (str): name for the callback
          jobs (list): list of SchedulerJob instances
          sched (SimpleScheduler): scheduler instance
        """
        for job in jobs:
            try:
                if job.exit_code != 0:
                    self._exit_code += 1
                self._stdout_files.append(job.log)
            except AttributeError:
                # Assume it's a group
                for j in job.jobs:
                    if j.exit_code != 0:
                        self._exit_code += 1
                    self._stdout_files.append(j.log)
        self.finish_task()

    def finish_task(self):
        """
        Internal: perform actions to finish the task
        """
        if self._exit_code != 0:
            logger.critical("%s failed: exit code %s"
                            % (self._name, self._exit_code))
        else:
            # Execute 'finish', if implemented
            self.invoke(self.finish)
        # Flag job as completed
        self._completed = True
        self.report("%s completed" % self._name)

    def add_cmd(self, pipeline_job):
        """
        Add a PipelineCommand to the task

        Arguments:
          pipeline_job (PipelineCommand): a PipelineCommand
            instance to be executed by the task when it runs
        """
        self._commands.append(pipeline_job)

    def run(self, sched=None, runner=None, working_dir=None,
            log_dir=None, scripts_dir=None, wait_for=(),
            async=True):
        """
        Run the task

        This method is not normally invoked directly; instead
        it's called by the pipeline that the task has been
        added to.

        Arguments:
          sched (SimpleScheduler): scheduler to submit jobs to
          runner (JobRunner): job runner to use when running
            jobs via the scheduler
          working_dir (str): path to the working directory to
            use (defaults to the current working directory)
          log_dir (str): path to the directory to write logs
            to (defaults to the working directory)
          scripts_dir (str): path to the directory to write
            scripts to (defaults to the working directory)
          wait_for (list): deprecated: list of scheduler jobs
            to wait for before running jobs from this task
          async (bool): if False then block until the task
            has completed
        """
        # Initialise
        if working_dir is None:
            working_dir = os.getcwd()
        self._working_dir = os.path.abspath(working_dir)
        if scripts_dir is None:
            scripts_dir = self._working_dir
        if log_dir is None:
            log_dir = self._working_dir
        # Do setup
        self.invoke(self.setup)
        # Generate commands to run
        cmds = []
        for command in self._commands:
            self.report("%s" % command.cmd())
            script_file = command.make_wrapper_script(scripts_dir=scripts_dir)
            cmd = Command('/bin/bash', script_file)
            self.report("wrapper script %s" % script_file)
            cmds.append(cmd)
        # Run the commands
        if cmds:
            use_group = (len(cmds) != 1)
            if use_group:
                # Run as a group
                group = sched.group(self.name())
                for j, cmd in enumerate(cmds):
                    name = "%s#%s" % (self.name(), j)
                    group.add(cmd,
                              wd=self._working_dir,
                              name=name,
                              runner=runner,
                              log_dir=log_dir,
                              wait_for=wait_for)
                group.close()
                callback_name = group.name
                callback_function = self.task_completed
                self._groups.append(group)
            else:
                # Run a single job
                cmd = cmds[0]
                name = self.name()
                job = sched.submit(cmd,
                                   wd=self._working_dir,
                                   name=name,
                                   runner=runner,
                                   log_dir=log_dir,
                                   wait_for=wait_for)
                callback_name = job.name
                callback_function = self.task_completed
                self._jobs.append(job)
            # Set up a callback which the scheduler will invoke
            # in background when the jobs complete
            sched.callback("%s" % self._name,
                           callback_function,
                           wait_for=(callback_name,))
            if not async:
                # Wait for job or group to complete before returning
                while not self.completed:
                    time.sleep(5)
        else:
            # No commands to execute
            self.finish_task()
        return self
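# Hedged sketch of the subclassing pattern described in the class
# docstring: implement 'init', 'setup', 'finish' and 'output'. The
# 'RunFastqc' task, the 'fastqc' command and its arguments are
# illustrative assumptions, not taken from the source.
class RunFastqc(PipelineTask):
    def init(self, fastqs, out_dir):
        # Declare inputs; values are accessible later via 'self.args'
        self._out_files = []
    def setup(self):
        # Queue one command per input Fastq
        for fq in self.args.fastqs:
            self.add_cmd(
                PipelineCommandWrapper("Run FastQC for %s" % fq,
                                       'fastqc',
                                       '--outdir', self.args.out_dir,
                                       fq))
    def finish(self):
        # Record the expected output locations once jobs are done
        for fq in self.args.fastqs:
            self._out_files.append(os.path.join(self.args.out_dir,
                                                os.path.basename(fq)))
    def output(self):
        # Return the collected outputs
        return self._out_files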