def __init__(self, namespace=None, throttle=0, memsize=20, time_limit=48,
             ssh_key=None, local_workdir='.'):
    '''
    Set up the configuration, cluster helper objects and local working
    directory for this manager.

    namespace defaults to the current process ID (as a string) when
    not supplied. memsize is expressed in GB. throttle, time_limit and
    ssh_key are stored unchanged. local_workdir is converted to an
    absolute path and created if it does not exist yet.
    '''
    self.config = Config()
    self.namespace = str(os.getpid()) if namespace is None else namespace

    # These will default to the config cluster working directory.
    self.runner = ClusterJobRunner()
    self.submitter = ClusterJobSubmitter()

    self.memsize = memsize  # expressed in GB
    self.time_limit = time_limit
    self.throttle = throttle
    self.ssh_key = ssh_key

    workdir = os.path.abspath(local_workdir)
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    self.local_workdir = workdir
def run_hicup(self):
    '''
    Copy the fastq file(s) to the cluster and submit the HiCUP jobs.

    Two cluster jobs are submitted: the HiCUP run itself, followed by
    a cs_run_hicup_postprocess.py job which depends on the first one.
    Nothing is returned; the ids of both jobs are logged.
    '''
    # Copy files
    destination = "%s@%s:%s" % (self.conf.user, self.conf.cluster,
                                self.conf.clusterworkdir)
    LOGGER.info("Copying %s to cluster ...", self.fq1)
    transfer_file(self.fq1, destination)
    if self.fq2 is not None:
        LOGGER.info("Copying %s to cluster ...", self.fq2)
        transfer_file(self.fq2, destination)

    # Send bsub for running hicup with correct number of thread request
    submitter = ClusterJobSubmitter()

    # FIXME: Yes, it is bad practice to hard code dependencies, but this
    # is a temporary fix as for some reason hicup cannot be found even
    # though it is in the path. Moreover, soft-linking hicup to bin does
    # not seem to be enough, probably because of the way dependencies
    # in the hicup main program are implemented.
    steps = [
        "mkdir %s" % self.hicup_output_dir,
        "sleep 1",
        "cd %s" % self.conf.clusterworkdir,
        "~/software/external/hicup_v0.5.10/hicup --config %s"
        % self.hicup_conf_fname,
    ]
    # Only remove the second fastq file when there is one: "rm None"
    # would fail and, because the steps are chained with &&, would also
    # prevent the removal of the config file.
    if self.fq2 is not None:
        steps.append("rm %s" % self.fq2)
    steps.append("rm %s" % self.hicup_conf_fname)
    cmd = " && ".join(steps)

    jobid = submitter.submit_command(cmd=cmd,
                                     mem=self.conf.clustermem,
                                     auto_requeue=False,
                                     threads=self.conf.num_threads)
    LOGGER.info("Hicup execution job id = %s", jobid)

    # The post-processing job only starts once the hicup job is done.
    cmd = "cd %s && cs_run_hicup_postprocess.py --fq1 %s" % (
        self.conf.clusterworkdir, self.fq1)
    jobid = submitter.submit_command(cmd=cmd,
                                     mem=self.conf.clustermem,
                                     auto_requeue=False,
                                     threads=self.conf.num_threads,
                                     depend_jobs=[jobid])
    LOGGER.info("Hicup post process job id = %s", jobid)
def ed_run_post_process(self, merged_fn, bams, source_path, merged_path,
                        compressed_path, print_commands_only=False):
    '''
    Build, print and (optionally) submit the cluster jobs which merge
    a set of bam files and mark duplicates in the merged file.

    merged_fn is the name of the merged file, written under
    merged_path; bams is a list of bam file names which are merged
    from within source_path. If self.cleanup is set, the inputs of
    each step are removed once that step has succeeded. When
    print_commands_only is True the commands are printed but no job is
    submitted. compressed_path is not used by the code shown here.

    Exits with status 1 if no genome is registered for the library
    code (the part of merged_fn before the first underscore).
    '''
    # Setup cluster job submitter
    submitter = ClusterJobSubmitter()

    bam_list = " ".join(bams)
    cleanup = ""
    if self.cleanup:
        cleanup = " && rm %s" % bam_list

    # Execute merge job
    merge_cmd = "cd %s && samtools merge -u -@ 8 %s %s%s" % (
        source_path, os.path.join(merged_path, merged_fn), bam_list,
        cleanup)
    print(merge_cmd)
    if not print_commands_only:
        jobid_merge = submitter.submit_command(cmd=merge_cmd,
                                               mem=8000,
                                               mincpus=8,
                                               auto_requeue=False)

    # Execute duplicate marking
    if self.cleanup:
        cleanup = " && rm %s" % merged_fn
    dupmark_fn = merged_fn + "_dupmark.bam"
    dupmark_log = merged_fn + "_dupmark.log"
    d_cmd = ("cd %s && picard --Xmx 32g MarkDuplicates I=%s O=%s M=%s "
             "VALIDATION_STRINGENCY=SILENT ASSUME_SORTED=True "
             "COMPRESSION_LEVEL=0 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1024"
             % (merged_path, merged_fn, dupmark_fn, dupmark_log))
    d_cmd = d_cmd + cleanup
    print(d_cmd)
    if not print_commands_only:
        # jobid_merge is always bound here: it is set under the same
        # condition just above.
        jobid_d = submitter.submit_command(cmd=d_cmd,
                                           mem=50000,
                                           auto_requeue=False,
                                           depend_jobs=[jobid_merge])

    # Set up coverage and flagstat computations for marked duplicate
    # files.
    # NOTE(review): coverage_fn, flagstat_fn and genome_size_fn are not
    # used any further in this method as it stands -- confirm whether
    # follow-up jobs are still to be added.
    coverage_fn = dupmark_fn + ".coverage"
    flagstat_fn = dupmark_fn + ".flagstat"
    genome_size_fn = None
    try:
        genome = Genome.objects.filter(
            library__code=merged_fn.split('_')[0])
        fasta = genome[0].fasta
        genome_size_fn = os.path.join(DBCONF.clustergenomedir,
                                      fasta + '.size')
    except (Genome.DoesNotExist, IndexError):
        # filter() never raises DoesNotExist; an empty result shows up
        # as an IndexError when the first element is requested.
        LOGGER.info("No reference genome for %s.", merged_fn)
        sys.exit(1)
def ed_download(self, print_download_commands_only=False):
    '''
    Downloads all project related files.

    One cs_edinburgh_download.py cluster job is set up for every file
    pair whose status is 'new'. With print_download_commands_only set,
    the commands are printed and nothing is submitted.
    '''
    # Set some variables for submitting the download jobs to cluster.
    submitter = ClusterJobSubmitter()

    # Install information about files locations, ids etc.
    self.ed_check()

    # The number of simultaneous downloads is controlled by making each
    # new job depend on the job which held the same slot in the
    # previous batch. The assumption is that, on average, all download
    # jobs take about the same time.
    slot = 0
    previous_batch = []
    current_batch = []
    for edstem in self.edfiles:
        edfile = self.edfiles[edstem]
        status = edfile.status
        if status != 'new':
            if status in ('downloaded', 'processed'):
                LOGGER.info("Skipping %s (%s)", edstem, status)
            continue

        # Prepare download command for submission to the cluster
        cmd = ('cs_edinburgh_download.py -a --file1 %s --file2 %s '
               '--file1_md5 %s --file2_md5 %s -p %s -l %d'
               % (edfile.file1, edfile.file2, edfile.file1_md5,
                  edfile.file2_md5, self.project, edfile.laneid))
        if print_download_commands_only:
            print(cmd)
            continue

        # Submit download job; the first batch has nothing to wait for.
        submit_args = dict(cmd=cmd, mem=1000, auto_requeue=False)
        if previous_batch:
            submit_args['depend_jobs'] = [previous_batch[slot]]
        jobid = submitter.submit_command(**submit_args)
        LOGGER.info("Setting up downloads for %s ... (jobid=%s)",
                    edstem, jobid)

        current_batch.append(int(jobid))
        slot += 1
        if slot == self.athreads:
            # Batch complete: it becomes the set of jobs to depend on.
            previous_batch, current_batch, slot = current_batch, [], 0
def ed_process(self, print_commands_only=False):
    '''
    Process files that have been labeled as downloaded.

    The steps shown here create a cluster job submitter, refresh the
    project file information via ed_check(), initialise django and
    look up the archive location named self.aname. The process exits
    if that archive location is not found.
    '''
    # Set some variables for submitting the download jobs to cluster.
    submitter = ClusterJobSubmitter()

    # Check the project files first
    self.ed_check()

    django.setup()

    # Fetch archive location in the file system.
    try:
        archive = ArchiveLocation.objects.get(name=self.aname)
    except ArchiveLocation.DoesNotExist:
        raise SystemExit(
            "Archive location '%s' does not exist!" % self.aname)
class ClusterJobManager(object):
    '''
    Moderately abstract base class providing some methods and
    attributes commonly used in higher-level cluster process
    management classes (e.g. GsnapManager, LastzManager).
    '''
    __slots__ = ('namespace', 'submitter', 'runner', 'config', 'throttle',
                 'memsize', 'ssh_key', 'local_workdir', 'time_limit')

    def __init__(self, namespace=None, throttle=0, memsize=20, time_limit=48,
                 ssh_key=None, local_workdir='.'):
        '''
        Set up configuration, cluster helper objects and the local
        working directory.

        namespace defaults to the current process ID (as a string).
        memsize is expressed in GB. local_workdir is made absolute and
        created if it does not exist yet.
        '''
        self.config = Config()
        if namespace is None:
            self.namespace = str(os.getpid())
        else:
            self.namespace = namespace

        # These will default to the config cluster working directory.
        self.runner = ClusterJobRunner()
        self.submitter = ClusterJobSubmitter()

        self.memsize = memsize  # expressed in GB
        self.time_limit = time_limit
        self.throttle = throttle
        self.ssh_key = ssh_key

        local_workdir = os.path.abspath(local_workdir)
        if not os.path.exists(local_workdir):
            os.mkdir(local_workdir)
        self.local_workdir = local_workdir

    def submit_command(self, cmd, *args, **kwargs):
        '''
        Submit a command to be run via bsub on the cluster. Returns the
        ID of the launched job. To wait on the completion of the
        submitted job, see the wait_on_cluster method.
        '''
        return self.submitter.submit_command(cmd, *args, **kwargs)

    def run_command(self, cmd, *args, **kwargs):
        '''
        Run a command directly on the cluster head node, waiting for
        the result to be returned. Returns a file descriptor containing
        the stdout of the job. This method is typically used for
        commands which should complete almost immediately.
        '''
        return self.runner.run_command(cmd, *args, **kwargs)

    def cluster_file_exists(self, file):
        '''
        Test whether a file exists in the configured cluster working
        directory. Returns True or False.
        '''
        cmd = ("if [ -e %s ]; then echo yes; else echo no; fi" % file)
        LOGGER.debug(cmd)

        # This runs the test command in the cluster working directory.
        with self.runner.run_command(cmd) as ofh:
            first_line = ofh.readline()
        return first_line.rstrip('\n') == 'yes'

    def cluster_jobs_count(self):
        '''
        Return a count of the jobs currently running on the cluster
        for the configured user. The count is never negative.
        '''
        cmd = ("bjobs -u %s" % self.config.clusteruser)
        LOGGER.debug(cmd)

        with self.runner.run_command(cmd) as ofh:
            line_count = sum(1 for _line in ofh)

        # Account for the extra header line. With no output at all
        # there is no header to discount, so clamp at zero rather than
        # reporting -1 jobs.
        return max(line_count - 1, 0)

    def return_file_to_localhost(self, clusterout, outfile, execute=True,
                                 donefile=False):
        '''
        If execute is False, returns a command string that can be used
        to transfer a cluster output file back to our local working
        directory. If execute is True, the command will also be run on
        the cluster. If donefile is set, the command additionally
        touches "<outfile>.done" in the local working directory once
        the copy has succeeded.
        '''
        myhost = getfqdn()
        myuser = getuser()
        sshcmd = "scp"

        # Transferring the files back to localhost requires an
        # appropriate passwordless ssh key to be given access on our
        # localhost. The alternative is some horrendous pexpect hack
        # which is only a little more secure (see: sshSangerTunnel.py).
        if self.ssh_key is not None:
            sshcmd += " -i %s" % self.ssh_key

        # Note that we need quoting of e.g. file paths containing
        # spaces. Also, the initial './' allows filenames to contain
        # colons.
        if not os.path.isabs(clusterout):
            clusterout = './%s' % (clusterout, )
        sshcmd += (
            r' %s %s@%s:\"' % (bash_quote(clusterout), myuser, myhost)
            + bash_quote(bash_quote(self.local_workdir + r'/%s' % outfile))
            + r'\"')

        if donefile:
            sshcmd += " && ssh"
            if self.ssh_key is not None:
                sshcmd += " -i %s" % self.ssh_key
            sshcmd += (r' %s@%s touch ' % (myuser, myhost)
                       + bash_quote(bash_quote(
                           self.local_workdir + r'/%s.done' % outfile)))

        if execute is True:
            # This *should* die on failure.
            self.runner.run_command(sshcmd)

        return sshcmd

    def wait_on_cluster(self, jobs, cleanup_cmd=None):
        '''
        Wait for the alignment jobs running on the cluster to contact
        a designated socket file location to indicate that the jobs
        have finished.

        jobs is the list of job IDs to wait for. cleanup_cmd, if
        given, is submitted as a further job which runs after the
        monitor job (typically used to delete temporary files).
        Returns None once the reply has been received.
        '''
        # Reserve a unique path for the socket. The temporary file
        # itself is deleted on leaving the with block, which frees the
        # path for bind() below.
        with NamedTemporaryFile() as sobj:
            socketfile = sobj.name

        sock = socket(AF_UNIX, SOCK_STREAM)
        chunks = []
        try:
            # Start listening *before* the monitor job is submitted, so
            # that the job cannot try to connect to a socket which is
            # not there yet.
            sock.bind(socketfile)
            sock.listen(1)

            # Set up a job to notify localhost that the cluster is
            # finished. The nc utility is pretty commonly installed; if
            # it is not, this will not work.
            LOGGER.info("Submitting monitor job to the cluster.")
            cmd = "ssh"
            if self.ssh_key is not None:
                cmd += " -i %s" % self.ssh_key
            cmd += (" %s@%s 'echo OK | nc -U %s'"
                    % (getuser(), getfqdn(), socketfile))
            monjob = self.submitter.submit_command(cmd,
                                                   depend_jobs=jobs,
                                                   auto_requeue=False)

            # Optional clean-up job, typically used to delete temporary
            # files.
            if cleanup_cmd is not None:
                LOGGER.info("Submitting clean-up job to the cluster.")
                self.submitter.submit_command(cleanup_cmd,
                                              depend_jobs=[monjob],
                                              auto_requeue=False)

            # Wait for the cluster to get back to us.
            LOGGER.info("Waiting on a reply from the cluster...")
            (conn, _addr) = sock.accept()
            try:
                while True:
                    data = conn.recv(1024)
                    if not data:
                        break
                    chunks.append(data)
            finally:
                conn.close()
        finally:
            # Always release the listening socket and remove its file,
            # also when submission or the wait itself fails.
            sock.close()
            if os.path.exists(socketfile):
                os.unlink(socketfile)

        message = b''.join(chunks)
        LOGGER.info("Cluster reply received: %s", message)
        return
def __init__(self, *args, **kwargs):
    '''
    Create the ClusterJobSubmitter held in self.job, then run the
    parent class initialiser.

    All positional and keyword arguments are forwarded unchanged to
    both ClusterJobSubmitter and the parent __init__.
    '''
    # self.job is assigned before the parent initialiser runs.
    self.job = ClusterJobSubmitter(*args, **kwargs)
    super(StarClusterJobSubmitter, self).__init__(*args, **kwargs)
class StarClusterJobSubmitter(AlignmentJobRunner):
    '''
    Class representing the submission of a STAR job to the cluster.
    This class works similarly to BwaClusterJobSubmitter.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Create the ClusterJobSubmitter held in self.job, then run the
        parent initialiser. All arguments are forwarded to both.
        '''
        self.job = ClusterJobSubmitter(*args, **kwargs)
        super(StarClusterJobSubmitter, self).__init__(*args, **kwargs)

    def submit(self, filenames, is_paired=False, destnames=None,
               cleanup=True, *args, **kwargs):
        '''
        Actually submit the job. The optional destnames argument can be
        used to name files on the cluster differently to the source.
        This is occasionally useful. When destnames is omitted, the
        base names of filenames are used. Any extra arguments are
        passed on to the submit_command call.

        Raises StandardError if cs_runStarWithSplit.py cannot be found
        on the cluster path.
        '''
        paired_sanity_check(filenames, is_paired)

        # First, copy the files across and uncompress on the server.
        LOGGER.info("Copying files to the cluster.")
        if destnames is None:
            destnames = [os.path.basename(fname) for fname in filenames]

        # We remove commas here because otherwise tophat is a little
        # too keen to split on them (quoting doesn't work). This is
        # applied to caller-supplied names as well.
        destnames = [re.sub(',+', '_', dname) for dname in destnames]
        destnames = self.job.transfer_data(filenames, destnames)

        # Next, create flag for cleanup
        cleanupflag = '--cleanup' if cleanup else ''
        sampleflag = ('--sample %s' % self.samplename
                      if self.samplename else '')

        # This now searches directly on the cluster.
        progpath = self.job.find_remote_executable(
            'cs_runStarWithSplit.py', path=self.conf.clusterpath)
        if progpath is None:
            # Same behaviour as BwaClusterJobSubmitter: fail here
            # instead of submitting a "python None ..." job.
            raise StandardError(
                "cs_runStarWithSplit.py not found on clusterpath."
                " Possible misconfiguration?")

        # Next, submit the actual jobs on the actual cluster.
        fnlist = " ".join([quote(x) for x in destnames])
        cmd = ("python %s --loglevel %d %s --rcp %s:%s %s %s %s"
               % (progpath, LOGGER.getEffectiveLevel(), cleanupflag,
                  self.conf.datahost, self.finaldir, sampleflag,
                  self.genome, fnlist))

        LOGGER.info("Submitting STAR job to cluster.")
        self.job.submit_command(cmd, *args, **kwargs)

    @classmethod
    def build_genome_index_path(cls, genome, *args, **kwargs):
        '''
        Return the path of the STAR index directory for the given
        genome on the cluster. Exits if the STAR version found on the
        cluster is not recorded as current in the repository.
        '''
        # Import here rather than main file as otherwise cluster
        # operations fail.
        from ..models import Program

        conf = Config()

        # Get information about default aligner, check that the program
        # is in path and try to predict its version.
        alignerinfo = ProgramSummary('STAR',
                                     ssh_host=conf.cluster,
                                     ssh_port=conf.clusterport,
                                     ssh_user=conf.clusteruser,
                                     ssh_path=conf.clusterpath)
        indexdir = None

        # Check that the version of aligner has been registered in
        # repository.
        try:
            Program.objects.get(program=alignerinfo.program,
                                version=alignerinfo.version,
                                current=True)
            indexdir = "%s_%s" % ('STAR', alignerinfo.version)
        except Program.DoesNotExist:
            sys.exit((
                """Aligner "%s" version "%s" found at path "%s" """
                % (alignerinfo.program, alignerinfo.version,
                   alignerinfo.path))
                + "not recorded as current in repository! Quitting.")

        # Build path to STAR genome dir. Note that STAR takes the dir
        # name only, without the indexdir.fa suffix at the end.
        gpath = genome_fasta_path(genome,
                                  indexdir=indexdir,
                                  genomedir=conf.clustergenomedir)

        # A bit of an ugly hack here: remove the indexdir.fa suffix
        # from the gpath created by genome_fasta_path.
        gpath = os.path.split(gpath)[0]

        return gpath
class BwaClusterJobSubmitter(AlignmentJobRunner):
    '''
    Class representing the submission of a bwa job to the cluster. This
    class in fact uploads the fastq file, gunzips it if necessary, and
    then submits a job to split the fastq file into chunks and run the
    alignments as secondary jobs, also spawning one last job which
    waits for the first to complete before merging the output and
    copying it back to the source server.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Create the ClusterJobSubmitter held in self.job, then run the
        parent initialiser. All arguments are forwarded to both.
        '''
        self.job = ClusterJobSubmitter(*args, **kwargs)
        super(BwaClusterJobSubmitter, self).__init__(*args, **kwargs)

    def submit(self, filenames, is_paired=False, destnames=None,
               cleanup=True, nocc=None, bwa_algorithm='aln',
               fileshost=None, nosplit=False, rcp=None, lcp=None,
               *args, **kwargs):
        '''
        Actually submit the job. The optional destnames argument can be
        used to name files on the cluster differently to the source.
        This is occasionally useful.

        bwa_algorithm must be 'aln' or 'mem'. nocc, when set, is passed
        on as --n_occ. fileshost, rcp and lcp override the defaults
        worked out from the local host name; lcp takes precedence over
        rcp. --no-split is only passed on when nosplit is true. Any
        extra arguments are passed on to the submit_command call.

        Raises StandardError if cs_runBwaWithSplit.py cannot be found
        on the cluster path.
        '''
        assert (bwa_algorithm in ('aln', 'mem'))

        paired_sanity_check(filenames, is_paired)

        # by lukk01:
        # NB! Copying files to the cluster is no longer necessary as
        # long as the hostflag = '--fileshost %s' line is uncommented
        # below. However, this would be a pull rather than a push, and
        # we should then pull from the archive for process_file.py. It
        # would require, though, re-writing of the data processing and
        # alignment orders in process_file.py. Just a thought.
        #
        # First, copy the files across and uncompress on the server.
        LOGGER.info("Copying files to the cluster.")
        destnames = self.job.transfer_data(filenames, destnames)

        # Next, create flag for cleanup
        cleanupflag = '--cleanup' if cleanup else ''

        # Next, create flag for number of non-unique reads to keep in
        # samse/sampe
        noccflag = ('--n_occ %s' % (nocc, )) if nocc else ''

        # Sample names containing spaces are bad on the command line,
        # and potentially problematic in bam read groups.
        sampleflag = ('--sample %s' % self.samplename
                      if self.samplename else '')

        # Whether to run bwa mem or aln.
        algoflag = '--algorithm %s' % bwa_algorithm

        # Deal with default values for fileshost and rcp/lcp, i.e.
        # figure out whether the files are located on the cluster and
        # whether results need to be copied somewhere or not.
        hostflag = ''
        filehost = gethostname()
        if filehost != self.conf.cluster:
            # hostflag = '--fileshost %s' % filehost
            cpflag = '--rcp %s:%s' % (self.conf.datahost, self.finaldir)
        else:
            # The files are already on the host. Override cleanup to
            # prevent source files from being deleted.
            LOGGER.info(
                "Input files are local. Overriding --cleanup to prevent files being deleted."
            )
            cleanupflag = ''
            cpflag = '--lcp %s' % self.finaldir

        # If fileshost has been specified, override default
        if fileshost is not None:
            hostflag = '--fileshost %s' % fileshost

        # If rcp has been specified, override default
        if rcp is not None:
            cpflag = '--rcp %s' % rcp

        # If lcp has been specified, override default
        if lcp is not None:
            cpflag = '--lcp %s' % lcp

        # Forward --no-split only when it was actually requested. The
        # default is False, so testing against None would always pass
        # the flag.
        splitflag = '--no-split' if nosplit else ''

        # This now searches directly on the cluster.
        progpath = self.job.find_remote_executable(
            'cs_runBwaWithSplit.py', path=self.conf.clusterpath)
        if progpath is None:
            raise StandardError(
                "cs_runBwaWithSplit.py not found on clusterpath. Possible misconfiguration?"
            )

        # Next, submit the actual jobs on the actual cluster. Note that
        # the source file names are used here, not destnames.
        if is_paired:
            LOGGER.debug("Running bwa on paired-end sequencing input.")
            fnlist = " ".join([quote(x) for x in filenames])
        else:
            LOGGER.debug("Running bwa on single-end sequencing input.")
            fnlist = quote(filenames[0])

        ## FIXME think about ways this could be improved.
        ## In the submitted command:
        ##   --rcp is where cs_runBwaWithSplit_Merge.py eventually
        ##   copies the reassembled bam file (via scp).
        cmd = ("python %s --loglevel %d %s %s %s %s %s %s %s %s %s"
               % (progpath, LOGGER.getEffectiveLevel(), cleanupflag,
                  hostflag, noccflag, cpflag, splitflag, sampleflag,
                  algoflag, self.genome, fnlist))

        LOGGER.info("Submitting bwa job to cluster.")
        self.job.submit_command(cmd, *args, **kwargs)

    @classmethod
    def build_genome_index_path(cls, genome, *args, **kwargs):
        '''
        Return the path of the aligner index for the given genome on
        the cluster. Exits if the version of the configured aligner
        found on the cluster is not recorded as current in the
        repository.
        '''
        # Import here rather than main file as otherwise cluster
        # operations fail.
        from ..models import Program

        conf = Config()

        # Get information about default aligner, check that the program
        # is in path and try to predict its version.
        alignerinfo = ProgramSummary(conf.aligner,
                                     ssh_host=conf.cluster,
                                     ssh_port=conf.clusterport,
                                     ssh_user=conf.clusteruser,
                                     ssh_path=conf.clusterpath)
        indexdir = None

        # Check that the version of aligner has been registered in
        # repository.
        try:
            Program.objects.get(program=alignerinfo.program,
                                version=alignerinfo.version,
                                current=True)
            indexdir = "%s-%s" % (alignerinfo.program, alignerinfo.version)

        # If aligner version is missing, try to insert it into the
        # database (FIXME not yet implemented while we see how this
        # works).
        except Program.DoesNotExist:
            sys.exit((
                """Aligner "%s" version "%s" found at path "%s" """
                % (alignerinfo.program, alignerinfo.version,
                   alignerinfo.path))
                + "not recorded as current in repository! Quitting.")

        gpath = genome_fasta_path(genome,
                                  indexdir=indexdir,
                                  genomedir=conf.clustergenomedir)

        return gpath