def start(self):
    """
    Invoked at boot time or when the mesosbox service is started.

    Waits for the cloud-init marker file to appear, runs the hosts-patching, EBS-mount and
    lazy-directory helpers, prepares the per-role log directory and finally emits the
    ``mesosbox-start-<role>`` event through ``initctl``. A node whose IP differs from the
    master's is treated as a slave and additionally copies ``shared_dir`` from the master.
    """
    cloud_init_marker = '/tmp/cloud-init.done'
    while True:
        if os.path.exists(cloud_init_marker):
            break
        log.info("Waiting for cloud-init to finish ...")
        time.sleep(1)

    log.info("Starting mesosbox")
    self.__patch_etc_hosts({'mesos-master': self.master_ip})
    self.__mount_ebs_volume()
    self.__create_lazy_dirs()

    # The node is the master exactly when its own IP is the master IP
    is_master = self.master_ip == self.node_ip
    node_type = 'master' if is_master else 'slave'
    if not is_master:
        self._copy_dir_from_master(shared_dir)

    # The log directory must be owned by the service user
    log_path = '/var/log/mesosbox/mesos{}'.format(node_type)
    mkdir_p(log_path)
    os.chown(log_path, self.uid, self.gid)

    log.info("Starting %s services" % node_type)
    start_event = 'mesosbox-start-%s' % node_type
    check_call([initctl, 'emit', start_event])
def __mount_ebs_volume(self):
    """
    Attach, format (if necessary) and mount the EBS volume with the same cluster ordinal as
    this node.

    The volume size is read from the ``ebs_volume_size`` instance tag. A missing or zero size
    means that no EBS volume is used and ``persistent_dir`` is redirected to ``ephemeral_dir``.

    :raise AssertionError: if the device node is not a block device, or if a non-empty volume
           carries a file system label other than the one derived from the volume name
    :raise RuntimeError: if the device is already mounted somewhere other than
           ``persistent_dir``
    """
    requested_size = int(self.instance_tag('ebs_volume_size') or '0')
    if not requested_size:
        # No persistent volume is attached and the root volume is off limits, so we will need
        # to place persistent data on the ephemeral volume.
        self.persistent_dir = self.ephemeral_dir
        return

    name_tag = self.instance_tag('Name')
    ordinal = int(self.instance_tag('cluster_ordinal'))
    volume_name = '%s__%d' % (name_tag, ordinal)
    volume = EC2VolumeHelper(ec2=self.ec2,
                             availability_zone=self.availability_zone,
                             name=volume_name,
                             size=requested_size,
                             volume_type="gp2")
    # TODO: handle case where volume is already attached
    # The volume is attached under one device name but polled under another
    device_ext = '/dev/sdf'
    device = '/dev/xvdf'
    volume.attach(self.instance_id, device_ext)

    # Wait for inode to appear and make sure its a block device
    while True:
        try:
            mode = os.stat(device).st_mode
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            time.sleep(1)
        else:
            assert stat.S_ISBLK(mode)
            break

    # Only format empty volumes
    volume_label = volume_label_hash(volume_name)
    probe = check_output(['file', '-sL', device]).strip()
    if probe == device + ': data':
        check_call(['mkfs', '-t', 'ext4', device])
        check_call(['e2label', device, volume_label])
    else:
        # If the volume is not empty, verify the file system label
        actual_label = check_output(['e2label', device]).strip()
        if actual_label != volume_label:
            raise AssertionError(
                "Expected volume label '%s' (derived from '%s') but got '%s'" %
                (volume_label, volume_name, actual_label))

    mounted_on = self.__mount_point(device)
    if mounted_on is None:
        mkdir_p(self.persistent_dir)
        check_call(['mount', device, self.persistent_dir])
    elif mounted_on != self.persistent_dir:
        raise RuntimeError(
            "Can't mount device %s on '%s' since it is already mounted on '%s'" %
            (device, self.persistent_dir, mounted_on))
def download_sample_and_align(job, sample, inputs, ids):
    """
    Schedules the download of a sample's FASTQ file(s) and the BWA-kit run that aligns them,
    followed by a job that places the resulting BAM in the output location.

    Note that both ``inputs`` (attributes ``cores`` and ``uuid``) and ``ids`` (keys ``r1`` and
    ``r2``) are modified in place.

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param tuple(str, list) sample: UUID and URLS for sample
    :param Namespace inputs: Contains input arguments
    :param dict ids: FileStore IDs for shared inputs
    """
    uuid, urls = sample
    # A single URL means there is no second read file
    r1_url, r2_url = urls if len(urls) == 2 else (urls[0], None)
    job.fileStore.logToMaster(
        'Downloaded sample: {0}. R1 {1}\nR2 {2}\nStarting BWA Run'.format(uuid, r1_url, r2_url))
    # Read fastq samples from file store
    ids['r1'] = job.addChildJobFn(download_url_job, r1_url, s3_key_path=inputs.ssec,
                                  disk=inputs.file_size).rv()
    if r2_url:
        ids['r2'] = job.addChildJobFn(download_url_job, r2_url, s3_key_path=inputs.ssec,
                                      disk=inputs.file_size).rv()
    else:
        ids['r2'] = None
    # Create config for bwakit
    inputs.cores = min(inputs.maxCores, multiprocessing.cpu_count())
    inputs.uuid = uuid
    config = dict(**vars(inputs))  # Create config as a copy of inputs since it has values we want
    config.update(ids)  # Overwrite attributes with the FileStoreIDs from ids
    config = argparse.Namespace(**config)
    # Define and wire job functions
    bam_id = job.wrapJobFn(run_bwakit, config, sort=inputs.sort, trim=inputs.trim,
                           disk=inputs.file_size, cores=inputs.cores)
    job.addFollowOn(bam_id)
    output_name = uuid + '.bam' + str(inputs.suffix) if inputs.suffix else uuid + '.bam'
    if urlparse(inputs.output_dir).scheme == 's3':
        bam_id.addChildJobFn(s3am_upload_job, file_id=bam_id.rv(), file_name=output_name,
                             s3_dir=inputs.output_dir, s3_key_path=inputs.ssec,
                             cores=inputs.cores, disk=inputs.file_size)
    else:
        # Was ``inputs.ouput_dir``, a misspelling that raised AttributeError for every local
        # output directory
        mkdir_p(inputs.output_dir)
        bam_id.addChildJobFn(copy_file_job, name=output_name, file_id=bam_id.rv(),
                             output_dir=inputs.output_dir, disk=inputs.file_size)
def __mount_ebs_volume(self):
    """
    Attach, format (if necessary) and mount the EBS volume with the same cluster ordinal as
    this node.

    The size comes from this instance's ``ebs_volume_size`` tag. When the tag is absent or
    zero, no volume is touched and ``persistent_dir`` is pointed at ``ephemeral_dir`` instead.

    :raise AssertionError: if the device node is not a block device, or if the label of a
           non-empty volume does not match the one derived from the volume name
    :raise RuntimeError: if the device is already mounted at a different mount point
    """
    size_tag = self.__get_instance_tag(self.instance_id, 'ebs_volume_size') or '0'
    volume_size = int(size_tag)
    if not volume_size:
        # No persistent volume is attached and the root volume is off limits, so we will need
        # to place persistent data on the ephemeral volume.
        self.persistent_dir = self.ephemeral_dir
    else:
        instance_name = self.__get_instance_tag(self.instance_id, 'Name')
        cluster_ordinal = int(self.__get_instance_tag(self.instance_id, 'cluster_ordinal'))
        volume_name = '%s__%d' % (instance_name, cluster_ordinal)
        volume = EC2VolumeHelper(ec2=self.ec2,
                                 availability_zone=self.availability_zone,
                                 name=volume_name,
                                 size=volume_size,
                                 volume_type="gp2")
        # TODO: handle case where volume is already attached
        device_ext = '/dev/sdf'
        device = '/dev/xvdf'
        volume.attach(self.instance_id, device_ext)

        # Wait for inode to appear and make sure its a block device
        block_device_seen = False
        while not block_device_seen:
            try:
                assert stat.S_ISBLK(os.stat(device).st_mode)
                block_device_seen = True
            except OSError as e:
                if e.errno == errno.ENOENT:
                    time.sleep(1)
                else:
                    raise

        # Only format empty volumes
        volume_label = volume_label_hash(volume_name)
        file_report = check_output(['file', '-sL', device]).strip()
        volume_is_empty = file_report == device + ': data'
        if volume_is_empty:
            check_call(['mkfs', '-t', 'ext4', device])
            check_call(['e2label', device, volume_label])
        else:
            # if the volume is not empty, verify the file system label
            actual_label = check_output(['e2label', device]).strip()
            if actual_label != volume_label:
                raise AssertionError(
                    "Expected volume label '%s' (derived from '%s') but got '%s'" %
                    (volume_label, volume_name, actual_label))

        existing_mount = self.__mount_point(device)
        if existing_mount is None:
            mkdir_p(self.persistent_dir)
            check_call(['mount', device, self.persistent_dir])
        elif existing_mount != self.persistent_dir:
            raise RuntimeError(
                "Can't mount device %s on '%s' since it is already mounted on '%s'" %
                (device, self.persistent_dir, existing_mount))
def setUpClass(cls):
    """
    Initialise the class-level temp-directory bookkeeping.

    ``_tempDirs`` starts out empty and ``_tempBaseDir`` is taken from the ``TOIL_TEST_TEMP``
    environment variable. A relative value is resolved against the project root and the
    resulting directory is created; an absolute value is stored as is, and an unset variable
    leaves ``_tempBaseDir`` as None.
    """
    super(ToilTest, cls).setUpClass()
    cls._tempDirs = []
    baseDir = os.environ.get('TOIL_TEST_TEMP', None)
    isRelative = baseDir is not None and not os.path.isabs(baseDir)
    if isRelative:
        projectRelative = os.path.join(cls._projectRootPath(), baseDir)
        baseDir = os.path.abspath(projectRelative)
        mkdir_p(baseDir)
    cls._tempBaseDir = baseDir
def __create_lazy_dirs(self):
    """
    Create the physical directory behind every entry of ``lazy_dirs`` and bind-mount it onto
    its logical path.

    Each entry is a ``(parent, name, persistent)`` tuple. The physical directory lives below
    ``persistent_dir`` when ``persistent`` is truthy and below ``ephemeral_dir`` otherwise,
    and is chown'ed to ``uid``/``gid`` before being mounted.
    """
    log.info("Bind-mounting directory structure")
    for parent, name, persistent in self.lazy_dirs:
        assert parent[0] == os.path.sep
        if persistent:
            location = self.persistent_dir
        else:
            location = self.ephemeral_dir
        # Drop the leading separator so the parent nests below the chosen location
        physical_path = os.path.join(location, parent[1:], name)
        mkdir_p(physical_path)
        os.chown(physical_path, self.uid, self.gid)
        logical_path = os.path.join(parent, name)
        check_call(['mount', '--bind', physical_path, logical_path])
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    Members of each input tarball are re-rooted under ``<uuid>/<tool>/<basename>``. The
    combined tarball is then uploaded to S3 or copied to a local directory, depending on the
    scheme of ``config.output_dir``.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Fetch whichever tarballs were produced and remember the subdirectory each one maps to
    sources = []
    for file_id, tool in ((mutect, 'mutect'), (pindel, 'pindel'), (muse, 'muse')):
        if not file_id:
            continue
        local_tar = job.fileStore.readGlobalFile(file_id,
                                                 os.path.join(work_dir, tool + '.tar.gz'))
        if local_tar is not None:
            sources.append((local_tar, tool))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untaring).
    # Open out_tar itself: it is already rooted at work_dir, and joining it onto work_dir a
    # second time would name a different file than the one shipped below whenever work_dir
    # is relative.
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar, tool in sources:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Extract before renaming since link lookup depends on the original name
                    member = f_in.extractfile(tarinfo)
                    tarinfo.name = os.path.join(config.uuid, tool,
                                                os.path.basename(tarinfo.name))
                    if member is None:
                        # Directories and other data-less members have no file object;
                        # wrapping None in closing() would fail, so add the header only
                        f_out.addfile(tarinfo)
                    else:
                        with closing(member):
                            f_out.addfile(tarinfo, fileobj=member)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid,
                                                                  config.output_dir))
        s3am_upload(job=job, fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid,
                                                                       config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
def consolidate_output_tarballs(job, inputs, vcqc_id, spladder_id):
    """
    Combine the contents of separate tarballs into one.

    Members are re-rooted under ``<uuid>/variants_and_qc/`` or ``<uuid>/spladder/``. The
    combined tarball is copied to ``inputs.output_dir`` and/or handed to an S3 upload child
    job, depending on which of ``output_dir`` and ``output_s3_dir`` are set.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str vcqc_id: FileStore ID of variant calling and QC tarball
    :param str spladder_id: FileStore ID of spladder tarball
    """
    job.fileStore.logToMaster('Consolidating files and uploading: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    uuid = inputs.uuid
    # Retrieve output file paths to consolidate
    vcqc_tar = job.fileStore.readGlobalFile(vcqc_id, os.path.join(work_dir, 'vcqc.tar.gz'))
    spladder_tar = job.fileStore.readGlobalFile(spladder_id,
                                                os.path.join(work_dir, 'spladder.tar.gz'))
    # I/O
    fname = uuid + '.tar.gz' if not inputs.improper_pair else 'IMPROPER_PAIR' + uuid + '.tar.gz'
    out_tar = os.path.join(work_dir, fname)
    # Consolidate separate tarballs into one. out_tar is already rooted at work_dir, so it is
    # opened directly; joining it onto work_dir again would name a different file than the
    # one copied/uploaded below whenever work_dir is relative.
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar, subdir in ((vcqc_tar, 'variants_and_qc'), (spladder_tar, 'spladder')):
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Extract before renaming since link lookup depends on the original name
                    member = f_in.extractfile(tarinfo)
                    tarinfo.name = os.path.join(uuid, subdir, os.path.basename(tarinfo.name))
                    if member is None:
                        # Data-less members (e.g. directories) have no file object; wrapping
                        # None in closing() would fail, so only the header is added
                        f_out.addfile(tarinfo)
                    else:
                        with closing(member):
                            f_out.addfile(tarinfo, fileobj=member)
    # Move to output directory
    if inputs.output_dir:
        mkdir_p(inputs.output_dir)
        shutil.copy(out_tar, os.path.join(inputs.output_dir, os.path.basename(out_tar)))
    # Upload to S3
    if inputs.output_s3_dir:
        out_id = job.fileStore.writeGlobalFile(out_tar)
        job.addChildJobFn(s3am_upload_job, file_id=out_id, s3_dir=inputs.output_s3_dir,
                          file_name=fname, key_path=inputs.ssec, cores=inputs.cores)
def __create_lazy_dirs(self):
    """
    Create the physical directory behind every entry of ``lazy_dirs`` and bind-mount it onto
    its logical path.

    Each entry is a ``(parent, name, persistent)`` tuple. When ``persistent`` is None the
    decision is read from an instance tag named ``persist`` followed by the logical path with
    every path separator replaced by an underscore.
    """
    log.info("Bind-mounting directory structure")
    for parent, name, persistent in self.lazy_dirs:
        assert parent[0] == os.path.sep
        logical_path = os.path.join(parent, name)
        if persistent is None:
            # Undecided entries defer to the per-instance tag
            tag_name = 'persist' + logical_path.replace(os.path.sep, '_')
            persistent = less_strict_bool(self.instance_tag(tag_name))
        if persistent:
            location = self.persistent_dir
        else:
            location = self.ephemeral_dir
        # Drop the leading separator so the parent nests below the chosen location
        relative_parent = parent[1:]
        physical_path = os.path.join(location, relative_parent, name)
        mkdir_p(physical_path)
        os.chown(physical_path, self.uid, self.gid)
        check_call(['mount', '--bind', physical_path, logical_path])
def __create_lazy_dirs(self):
    """
    Bind-mount each lazy directory onto a physical directory on the persistent or the
    ephemeral volume.

    ``lazy_dirs`` yields ``(parent, name, persistent)`` tuples. A ``persistent`` value of None
    is resolved through the instance tag ``persist<logical path with separators as '_'>``.
    The physical directory is created and chown'ed to ``uid``/``gid`` before mounting.
    """
    log.info("Bind-mounting directory structure")
    for entry in self.lazy_dirs:
        (parent, name, persistent) = entry
        assert parent[0] == os.path.sep
        mount_target = os.path.join(parent, name)
        if persistent is None:
            tag = 'persist' + mount_target.replace(os.path.sep, '_')
            persistent = less_strict_bool(self.instance_tag(tag))
        backing_root = self.persistent_dir if persistent else self.ephemeral_dir
        # parent[1:] strips the leading separator so join() nests it below backing_root
        backing_dir = os.path.join(backing_root, parent[1:], name)
        mkdir_p(backing_dir)
        os.chown(backing_dir, self.uid, self.gid)
        mount_command = ['mount', '--bind', backing_dir, mount_target]
        check_call(mount_command)
def consolidate_output(job, config, kallisto_output, graphical_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    Kallisto members end up directly under ``<uuid>/`` and plot members under
    ``<uuid>/plots/``; in both cases only the member's basename is kept.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param str graphical_output: FileStoreID for output of graphing step
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve output file paths to consolidate
    kallisto_tar = None
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto_output.tar.gz'))
    graphical_tar = None
    if graphical_output:
        graphical_tar = job.fileStore.readGlobalFile(
            graphical_output, os.path.join(work_dir, 'single_cell_plots.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Each source is paired with the path components inserted between uuid and basename;
    # plots are processed first, then kallisto
    candidates = [(graphical_tar, ('plots',)), (kallisto_tar, ())]
    sources = [(path, parts) for path, parts in candidates if path is not None]
    # Consolidate separate tarballs into one as streams (avoids unnecessary untaring)
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar, parts in sources:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        components = (config.uuid,) + parts + (os.path.basename(tarinfo.name),)
                        tarinfo.name = os.path.join(*components)
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    destination_is_s3 = urlparse(config.output_dir).scheme == 's3'
    if destination_is_s3:
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid,
                                                                  config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid,
                                                                       config.output_dir))
        mkdir_p(config.output_dir)
        local_tarball = os.path.join(work_dir, config.uuid + '.tar.gz')
        copy_files(file_paths=[local_tarball], output_dir=config.output_dir)
def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    Members are re-rooted under ``<uuid>/RSEM``, ``<uuid>/RSEM/Hugo``, ``<uuid>/Kallisto`` or
    ``<uuid>/QC``. For single-end samples ``config.uuid`` is prefixed with ``SINGLE-END.`` in
    place before the tarball is named.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
    :param str fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
    if rsem_output:
        rsem_id, hugo_id = rsem_output
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id,
                                                os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output,
                                                    os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output,
                                                  os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    if not config.paired:
        config.uuid = 'SINGLE-END.{}'.format(config.uuid)
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Pair every available tarball with the path components it is re-rooted under
    candidates = [(rsem_tar, ('RSEM',)),
                  (hugo_tar, ('RSEM', 'Hugo')),
                  (kallisto_tar, ('Kallisto',)),
                  (fastqc_tar, ('QC',))]
    sources = [(path, parts) for path, parts in candidates if path is not None]
    # Consolidate separate tarballs into one as streams (avoids unnecessary untaring).
    # out_tar is already rooted at work_dir, so it is opened directly; joining it onto
    # work_dir again would name a different file than the one shipped below whenever
    # work_dir is relative.
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar, parts in sources:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Extract before renaming since link lookup depends on the original name
                    member = f_in.extractfile(tarinfo)
                    components = (config.uuid,) + parts + (os.path.basename(tarinfo.name),)
                    tarinfo.name = os.path.join(*components)
                    if member is None:
                        # Data-less members (e.g. directories) have no file object; wrapping
                        # None in closing() would fail, so only the header is added
                        f_out.addfile(tarinfo)
                    else:
                        with closing(member):
                            f_out.addfile(tarinfo, fileobj=member)
    # Move to output directory
    if config.output_dir:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid,
                                                                       config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
    # Upload to S3
    if config.s3_output_dir:
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid,
                                                                  config.s3_output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)
def _copy_dir_from_master(self, dir):
    """
    Recursively rsync a directory from the master into the same path on this node.

    The rsync is run as user ``mesosbox`` and retried every 10 seconds until it succeeds.
    Afterwards the local directory is chown'ed to ``uid``/``gid``. A falsy ``dir`` makes this
    method a no-op.

    :param str dir: path of the directory, identical on the master and on this node
    """
    if not dir:
        return
    mkdir_p(dir)
    rsync_command = ['sudo', '-u', 'mesosbox',
                     'rsync', '-r',
                     '-e', 'ssh -o StrictHostKeyChecking=no',
                     "mesos-master:" + dir, dir]
    while True:
        try:
            check_call(rsync_command)
        except Exception:
            # Deliberately broad since any failure should be retried, but not a bare except:
            # KeyboardInterrupt and SystemExit must be able to break out of this endless loop.
            log.warning("Failed to rsync specified directory, trying again in 10 sec")
            time.sleep(10)
        else:
            break
    os.chown(dir, self.uid, self.gid)
def start(self):
    """
    Bring up the services for this node's role.

    Waits for the cloud-init marker file, patches the hosts file with the master's IP,
    prepares the per-role log directory and emits the ``mesosbox-start-<role>`` event through
    ``initctl``. The role is ``master`` when the node's IP equals the master's IP and
    ``slave`` otherwise.
    """
    cloud_init_marker = '/tmp/cloud-init.done'
    while True:
        if os.path.exists(cloud_init_marker):
            break
        log.info("Waiting for cloud-init to finish ...")
        time.sleep(1)
    self.__patch_etc_hosts({'mesos-master': self.master_ip})
    node_type = 'master' if self.master_ip == self.node_ip else 'slave'
    # The log directory must be owned by the service user
    log_path = '/var/log/mesosbox/mesos{}'.format(node_type)
    mkdir_p(log_path)
    os.chown(log_path, self.uid, self.gid)
    log.info("Starting %s services" % node_type)
    start_event = 'mesosbox-start-%s' % node_type
    check_call([initctl, 'emit', start_event])
def _testExternal(self, moduleName, pyFiles):
    """
    Create stub modules in a fresh temp directory, import one of them and run ``_test``
    against it.

    Every path in ``pyFiles`` is created below the temp directory with the single statement
    ``pass``. The expected contents handed to ``_test`` are those paths with a ``c`` appended.
    The imported module is removed from ``sys.modules`` and the temp directory from
    ``sys.path`` afterwards, even if the test fails.

    :param str moduleName: dotted name of the module to import
    :param pyFiles: relative paths of the .py files to create
    """
    tempDir = self._createTempDir()
    expectedPycs = {relPath + 'c' for relPath in pyFiles}
    for relPath in pyFiles:
        target = os.path.join(tempDir, relPath)
        mkdir_p(os.path.dirname(target))
        with open(target, 'w') as stub:
            stub.write('pass\n')
    sys.path.append(tempDir)
    try:
        userScript = importlib.import_module(moduleName)
        try:
            self._test(userScript.__name__, expectedContents=expectedPycs)
        finally:
            # Forget the module so that it does not leak into other tests
            del userScript
            del sys.modules[moduleName]
        stillLoaded = moduleName in sys.modules
        self.assertFalse(stillLoaded)
    finally:
        sys.path.remove(tempDir)
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    Each member keeps only its basename and is placed under ``<uuid>/mutect``,
    ``<uuid>/pindel`` or ``<uuid>/muse``. The result goes to S3 when ``config.output_dir`` is
    an ``s3`` URL and to a local directory otherwise.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect,
                                                  os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel,
                                                  os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untaring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    # out_tar already contains work_dir. Joining the two again only yields the same file when
    # work_dir is absolute, so the tarball is opened under the exact name shipped below.
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            if tar is mutect_tar:
                subdir = 'mutect'
            elif tar is pindel_tar:
                subdir = 'pindel'
            else:
                subdir = 'muse'
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Extract before renaming since link lookup depends on the original name
                    f_in_file = f_in.extractfile(tarinfo)
                    tarinfo.name = os.path.join(config.uuid, subdir,
                                                os.path.basename(tarinfo.name))
                    if f_in_file is None:
                        # extractfile() yields None for members without data (directories
                        # etc.); closing(None) would raise, so just write the header
                        f_out.addfile(tarinfo)
                        continue
                    with closing(f_in_file):
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid,
                                                                  config.output_dir))
        s3am_upload(job=job, fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid,
                                                                       config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
def consolidate_output_tarballs(job, inputs, vcqc_id, spladder_id):
    """
    Combine the contents of separate tarballs into one.

    Each member keeps only its basename and is placed under ``<uuid>/variants_and_qc`` or
    ``<uuid>/spladder``. The tarball name is prefixed with ``IMPROPER_PAIR`` when
    ``inputs.improper_pair`` is set.

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str vcqc_id: FileStore ID of variant calling and QC tarball
    :param str spladder_id: FileStore ID of spladder tarball
    """
    job.fileStore.logToMaster('Consolidating files and uploading: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    uuid = inputs.uuid
    # Retrieve output file paths to consolidate
    vcqc_tar = job.fileStore.readGlobalFile(vcqc_id, os.path.join(work_dir, 'vcqc.tar.gz'))
    spladder_tar = job.fileStore.readGlobalFile(spladder_id,
                                                os.path.join(work_dir, 'spladder.tar.gz'))
    # I/O
    fname = uuid + '.tar.gz' if not inputs.improper_pair else 'IMPROPER_PAIR' + uuid + '.tar.gz'
    out_tar = os.path.join(work_dir, fname)
    # Consolidate separate tarballs into one. out_tar already contains work_dir; joining the
    # two again only yields the same file when work_dir is absolute, so the tarball is opened
    # under the exact name that is copied/uploaded below.
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in [vcqc_tar, spladder_tar]:
            subdir = 'variants_and_qc' if tar == vcqc_tar else 'spladder'
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Extract before renaming since link lookup depends on the original name
                    f_in_file = f_in.extractfile(tarinfo)
                    tarinfo.name = os.path.join(uuid, subdir, os.path.basename(tarinfo.name))
                    if f_in_file is None:
                        # extractfile() yields None for members without data (directories
                        # etc.); closing(None) would raise, so just write the header
                        f_out.addfile(tarinfo)
                        continue
                    with closing(f_in_file):
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if inputs.output_dir:
        mkdir_p(inputs.output_dir)
        shutil.copy(out_tar, os.path.join(inputs.output_dir, os.path.basename(out_tar)))
    # Upload to S3
    if inputs.output_s3_dir:
        out_id = job.fileStore.writeGlobalFile(out_tar)
        job.addChildJobFn(s3am_upload_job, file_id=out_id, s3_dir=inputs.output_s3_dir,
                          file_name=fname, key_path=inputs.ssec, cores=inputs.cores)
def _testExternal(self, moduleName, pyFiles, virtualenv=False):
    """
    Create stub modules in a temp directory (optionally inside a fresh virtualenv), import
    one of them and run ``_test`` against it.

    With ``virtualenv`` set, a virtualenv is created in the temp directory, ``sys.prefix`` is
    pointed at it for the duration of the test and the stub modules are written to its
    ``lib/python2.7/site-packages`` directory. ``sys.prefix``, ``sys.path`` and
    ``sys.modules`` are restored on the way out.

    :param str moduleName: dotted name of the module to import
    :param pyFiles: relative paths of the .py files to create
    :param bool virtualenv: whether to place the modules in a newly created virtualenv
    """
    dirPath = self._createTempDir()
    if virtualenv:
        self.assertTrue(inVirtualEnv())
        # --never-download prevents silent upgrades to pip, wheel and setuptools
        check_call(['virtualenv', '--never-download', dirPath])
        sitePackages = os.path.join(dirPath, 'lib', 'python2.7', 'site-packages')
        # tuple assignment is necessary to make this line immediately precede the try:
        oldPrefix, sys.prefix, dirPath = sys.prefix, dirPath, sitePackages
    else:
        # None tells the outermost finally clause that sys.prefix was left untouched
        oldPrefix = None
    try:
        # The contents passed to _test are the stub paths with a 'c' appended
        pycFiles = set(pyFile + 'c' for pyFile in pyFiles)
        for relPath in pyFiles:
            path = os.path.join(dirPath, relPath)
            mkdir_p(os.path.dirname(path))
            with open(path, 'w') as f:
                f.write('pass\n')
        sys.path.append(dirPath)
        try:
            userScript = importlib.import_module(moduleName)
            try:
                self._test(userScript.__name__,
                           expectedContents=pycFiles,
                           allowExtraContents=virtualenv)
            finally:
                del userScript
                # Remove the module and each of its parent packages from sys.modules,
                # walking up the dotted name until nothing is left
                while moduleName:
                    del sys.modules[moduleName]
                    self.assertFalse(moduleName in sys.modules)
                    moduleName = '.'.join(moduleName.split('.')[:-1])
        finally:
            sys.path.remove(dirPath)
    finally:
        if oldPrefix:
            sys.prefix = oldPrefix
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Writes a file from the FileStore to an output directory on the local filesystem or S3.

    For a local output directory nothing is copied if a file of that name is already present
    there; a message is logged instead.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    local_target = os.path.join(work_dir, filename)
    filepath = job.fileStore.readGlobalFile(file_id, local_target)

    if urlparse(output_dir).scheme == 's3':
        s3am_upload(fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir,
                    s3_key_path=s3_key_path)
        return

    if os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
        return

    mkdir_p(output_dir)
    copy_files([filepath], output_dir)
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Places a file from the FileStore in an output directory on the local filesystem or S3.

    The file is first read from the FileStore into the job's local temp directory. An ``s3``
    URL triggers an upload; otherwise the file is copied unless a file of that name already
    exists in the output directory.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    file_store = job.fileStore
    file_store.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = file_store.getLocalTempDir()
    filepath = file_store.readGlobalFile(file_id, os.path.join(work_dir, filename))

    destination_is_s3 = urlparse(output_dir).scheme == 's3'
    if destination_is_s3:
        upload_source = os.path.join(work_dir, filepath)
        s3am_upload(job=job, fpath=upload_source, s3_dir=output_dir, s3_key_path=s3_key_path)
    else:
        already_present = os.path.exists(os.path.join(output_dir, filename))
        if already_present:
            job.fileStore.logToMaster("File already exists: {}".format(filename))
        else:
            mkdir_p(output_dir)
            copy_files([filepath], output_dir)
def download_sample_and_align(job, sample, inputs, ids):
    """
    Wires up the jobs that download a sample, align it with BWA-kit and deliver the BAM.

    Download jobs are added as children of ``job`` and the BWA-kit job as its follow-on. The
    BAM is delivered by a child of the BWA-kit job: an S3 upload when ``inputs.output_dir``
    is an ``s3`` URL, a local copy otherwise. ``inputs.cores``, ``inputs.uuid``, ``ids['r1']``
    and ``ids['r2']`` are set as a side effect.

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param tuple(str, list) sample: UUID and URLS for sample
    :param Namespace inputs: Contains input arguments
    :param dict ids: FileStore IDs for shared inputs
    """
    uuid, urls = sample
    # A single URL means there is no second read file
    r1_url, r2_url = urls if len(urls) == 2 else (urls[0], None)
    job.fileStore.logToMaster(
        'Downloaded sample: {0}. R1 {1}\nR2 {2}\nStarting BWA Run'.format(uuid, r1_url, r2_url))

    # Read fastq samples from file store
    ids['r1'] = job.addChildJobFn(download_url_job, r1_url, s3_key_path=inputs.ssec,
                                  disk=inputs.file_size).rv()
    if r2_url:
        ids['r2'] = job.addChildJobFn(download_url_job, r2_url, s3_key_path=inputs.ssec,
                                      disk=inputs.file_size).rv()
    else:
        ids['r2'] = None

    # Create config for bwakit
    inputs.cores = min(inputs.maxCores, multiprocessing.cpu_count())
    inputs.uuid = uuid
    config = dict(**vars(inputs))  # Create config as a copy of inputs since it has values we want
    config.update(ids)  # Overwrite attributes with the FileStoreIDs from ids
    config = argparse.Namespace(**config)

    # Define and wire job functions
    bam_id = job.wrapJobFn(run_bwakit, config, sort=inputs.sort, trim=inputs.trim,
                           disk=inputs.file_size, cores=inputs.cores)
    job.addFollowOn(bam_id)
    output_name = uuid + '.bam' + str(inputs.suffix) if inputs.suffix else uuid + '.bam'
    if urlparse(inputs.output_dir).scheme == 's3':
        bam_id.addChildJobFn(s3am_upload_job, file_id=bam_id.rv(), file_name=output_name,
                             s3_dir=inputs.output_dir, s3_key_path=inputs.ssec,
                             cores=inputs.cores, disk=inputs.file_size)
    else:
        # The attribute was misspelled ``ouput_dir`` here, which made every run with a local
        # output directory fail with AttributeError
        mkdir_p(inputs.output_dir)
        bam_id.addChildJobFn(copy_file_job, name=output_name, file_id=bam_id.rv(),
                             output_dir=inputs.output_dir, disk=inputs.file_size)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl: apt-get install curl
    Docker: wget -qO- https://get.docker.com/ | sh
    Toil: pip install toil
    Boto: pip install boto (OPTIONAL)
    """
    # NOTE: the docstring above doubles as the --help description of the command line parser,
    # so it holds user-facing text rather than developer documentation. This function parses
    # the command line, then either generates config/manifest templates in the current
    # directory or validates the config and starts the Toil workflow.
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config', default=DEFAULT_CONFIG_NAME, type=str,
        help='Path to the (filled in) config file, generated with "generate-config". '
             '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest', default=DEFAULT_MANIFEST_NAME, type=str,
        help='Path to the (filled in) manifest file, generated with "generate-manifest". '
             '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME), generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(os.path.exists(args.config),
                '{} not found. Please run '
                '"toil-marginphase generate-config"'.format(args.config))
        require(os.path.exists(args.manifest),
                '{} not found and no samples provided. Please '
                'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config. The file is read inside a with-block so that the handle is closed.
        # NOTE(review): yaml.load without an explicit Loader can construct arbitrary objects;
        # consider yaml.safe_load if the config may ever come from an untrusted source.
        with open(args.config) as config_file:
            raw_config = yaml.load(config_file.read())
        # Config keys use dashes; turn them into valid attribute names
        parsed_config = {x.replace('-', '_'): y for x, y in raw_config.iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint

        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size, "Configuration parameter partition-size is required")
        require(config.partition_margin, "Configuration parameter partition-margin is required")

        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError("Config parameter 'save_intermediate_files' cannot be used "
                            "with s3 output directory")
        else:
            intermediate_location = os.path.join(
                config.output_dir, "intermediate",
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location

        # Docker images/tags fall back to their defaults when missing or empty. A truthiness
        # test is used instead of len() because a key that is present but left blank in the
        # YAML parses as None, on which len() raises TypeError.
        docker_defaults = (
            ("margin_phase_image", DOCKER_MARGIN_PHASE_IMG_DEFAULT),
            ("margin_phase_tag", DOCKER_MARGIN_PHASE_TAG_DEFAULT),
            ("cpecan_image", DOCKER_CPECAN_IMG_DEFAULT),
            ("cpecan_tag", DOCKER_CPECAN_TAG_DEFAULT),
        )
        for key, default in docker_defaults:
            if not getattr(config, key, None):
                setattr(config, key, default)

        # Optional boolean switches default to False when absent from the config
        for flag in ("unittest", "minimal_output", "minimal_cpecan_output",
                     "cpecan_probabilities"):
            if flag not in config:
                setattr(config, flag, False)

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            # The message used to call .format() on a string without placeholder, which was
            # a no-op; the resulting text is unchanged
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(Job.wrapJobFn(map_job, prepare_input, samples, config), args)
def consolidate_output(job, config, chunk_infos):
    """
    Merge the per-chunk output tarballs into a single ``<uuid>.tar.gz`` and deliver it to
    ``config.output_dir`` (uploaded via s3am for ``s3://`` URLs, copied otherwise).

    :param job: Toil job; its fileStore provides the scratch directory and the chunk tarballs
    :param config: Namespace; reads ``uuid``, ``minimal_output``, ``minimal_cpecan_output``,
           ``output_dir`` and (for S3 uploads) ``maxCores``
    :param chunk_infos: sized iterable of dicts, each holding ``CI_OUTPUT_FILE_ID`` and
           ``CI_CHUNK_INDEX``
    :return: ``config.output_dir`` joined with the tarball's base name (same for s3 and local)
    """
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    out_tar = os.path.join(work_dir, '{}.tar.gz'.format(config.uuid))
    log(job, "{}".format(datetime.datetime.now()), uuid, 'consolidate_output')
    log(job, "consolidating {} files".format(len(chunk_infos)), uuid, 'consolidate_output')

    # build tarball
    # NOTE(review): out_tars also contains the output tarball itself, so the "tarballs" count
    # logged below is one more than the number of inputs - confirm this is intended.
    out_tars = [out_tar]
    output_file_count = 0
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for ci in chunk_infos:
            file_id = ci[CI_OUTPUT_FILE_ID]
            tar_file = os.path.join(work_dir, "{}.tar.gz".format(ci[CI_CHUNK_INDEX]))
            job.fileStore.readGlobalFile(file_id, tar_file)
            out_tars.append(tar_file)
            with tarfile.open(tar_file, 'r') as f_in:
                for tarinfo in f_in:
                    # Minimal output keeps only the merged alignment files
                    if config.minimal_output and (
                            tarinfo.name.endswith(("bam", "sam", "bai"))
                            and ID_MERGED not in tarinfo.name):
                        log(job,
                            "(Minimal Output) Skipping output file: {}".format(tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    if config.minimal_cpecan_output and tarinfo.name.endswith("gz"):
                        log(job,
                            "(Minimal cPecan Output) Skipping output file: {}".format(
                                tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    log(job, "file {}".format(tarinfo.name), uuid, 'consolidate_output')
                    # extractfile() returns None for members that carry no data (directories,
                    # devices, ...). Wrapping None in closing() would raise AttributeError on
                    # exit, so such members are copied header-only.
                    f_in_file = f_in.extractfile(tarinfo)
                    if f_in_file is None:
                        f_out.addfile(tarinfo)
                    else:
                        with closing(f_in_file):
                            f_out.addfile(tarinfo, fileobj=f_in_file)
                    output_file_count += 1
    log(job,
        "Consolidated {} files in {} tarballs".format(output_file_count, len(out_tars)),
        uuid, 'consolidate_output')

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        log(job, "Uploading {} to S3: {}".format(out_tar, config.output_dir),
            uuid, 'consolidate_output')
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.maxCores)
    else:
        log(job, "Moving {} to output dir: {}".format(out_tar, config.output_dir),
            uuid, 'consolidate_output')
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)

    # log
    log_generic_job_debug(job, config.uuid, "consolidate_output", work_dir=work_dir)
    log_time(job, "consolidate_output", start, config.uuid)
    log(job, "{}".format(datetime.datetime.now()), uuid, 'END')

    # return location (calculated the same whether s3:// or file://)
    return os.path.join(config.output_dir, os.path.basename(out_tar))
def prepare_input(job, sample, config, enqueue_consolidation=True):
    """
    Fetch the inputs for one sample, split its BAM into overlapping chunks and enqueue one
    marginPhase child job per non-empty chunk, followed by a merge job and (optionally) a
    consolidation job.

    :param job: Toil job this function runs as
    :param sample: 5-tuple ``(uuid, url, contig_name, reference_url, params_url)``; URLs that
           start with TOIL_JOBSTORE_PROTOCOL are read from the job store instead of downloaded
    :param config: Namespace of pipeline settings. It is shallow-copied first, so the
           per-sample attributes set here do not leak into the caller's object.
    :param enqueue_consolidation: when True, ``consolidate_output`` is chained after the merge
           job and its promise is returned; otherwise the merge job's promise is returned
    :return: promise (``rv()``) of the last job enqueued
    :raises UserError: if indexing did not produce a ``.bai`` next to the BAM
    """
    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(
            config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}".format(
            url, contig_name, reference_url, params_url),
        uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))

    # download references - TOIL_JOBSTORE_PROTOCOL queries are so this function can be imported
    # ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir, ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    # params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
            os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    # presumably docker_call mounts work_dir at /data inside the container - TODO confirm
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(
            bam_filename, workdir_bai_location))

    # get start and end location: smallest reference_start / largest reference_end of any read
    start_idx = sys.maxint
    end_idx = 0
    read_mode = 'rb' if bam_filename.endswith("bam") else 'r'
    with closing(pysam.AlignmentFile(workdir_bam_location, read_mode)) as aln:
        for read in aln.fetch():
            start_idx = min(start_idx, read.reference_start)
            end_idx = max(end_idx, read.reference_end)
    # Fixed: the format string has two placeholders but used to receive (uuid, start, end),
    # so it reported the uuid as the start position and never showed the end position.
    log(job, "start_pos:{}, end_pos:{}".format(start_idx, end_idx), uuid, 'prepare_input')

    # get reads from positions: consecutive windows of partition_size, each widened by
    # partition_margin on both sides. If no reads were seen, start_idx > end_idx and no
    # chunks are produced.
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        ci[CI_CHUNK_START] = idx - config.partition_margin
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        ci[CI_CHUNK_END] = idx + config.partition_margin
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid, 'prepare_input')
    enqueued_jobs = 0
    returned_tarballs = list()
    for idx, ci in enumerate(chunk_infos):
        # prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name, chunk_start,
                                                       chunk_end)
        bam_split_command = ["view", "-b", data_bam_location, chunk_position_description]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        # write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job, config, work_dir, bam_split_command,
                        DOCKER_SAMTOOLS_IMG, DOCKER_SAMTOOLS_TAG, outfile=out)

        # document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir, chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size,
                int(chunk_size / 1024 / 1024), read_count),
            uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location],
                       output_dir=config.intermediate_file_location)

        # enqueue marginPhase job (chunks without reads are skipped but keep their index)
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            # Resource requests scale with chunk and reference size, capped by the config
            mp_mem = int(min(
                int(chunk_size * MP_MEM_BAM_FACTOR + ref_genome_size * MP_MEM_REF_FACTOR),
                config.maxMemory))
            # NOTE(review): the cPecan factor is applied when cpecan_probabilities is
            # *falsy* - confirm this is not inverted.
            mp_disk = int(min(
                int(chunk_size * MP_DSK_BAM_FACTOR +
                    ref_genome_size * MP_DSK_REF_FACTOR +
                    (0 if config.cpecan_probabilities else MP_DSK_CPECAN_FACTOR) * chunk_size),
                config.maxDisk))
            log(job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024),
                    mp_mem, int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            # Toil takes human-readable sizes; truncate to whole kilobytes. Both values are
            # converted the same way so the string never contains a fractional part.
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase, config, chunk_fileid, ci,
                                                 memory=mp_mem, cores=mp_cores, disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output, config,
                                                       merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
def testDockerClean(self, disableCaching=True, detached=True, rm=True, deferParam=None):
    """
    Run the test container that creates a file in the work dir, and sleeps
    for 5 minutes.
    Ensure that the calling job gets SIGKILLed after a minute, leaving
    behind the spooky/ghost/zombie container. Ensure that the container is
    killed on batch system shutdown (through the deferParam mechanism).

    :param disableCaching: value assigned to ``options.disableCaching``
    :param detached: forwarded to ``_testDockerCleanFn``
    :param rm: forwarded to ``_testDockerCleanFn``; also selects the expected end state
    :param deferParam: one of FORGO, STOP, RM or None; selects the expected end state
    """
    # We need to test the behaviour of `deferParam` with `rm` and
    # `detached`. We do not look at the case where `rm` and `detached` are
    # both True.  This is the truth table for the different combinations at
    # the end of the test. R = Running, X = Does not exist, E = Exists but
    # not running.
    #              None     FORGO     STOP    RM
    #    rm        X         R         X      X
    # detached     R         R         E      X
    #  Neither     R         R         E      X
    data_dir = os.path.join(self.tempDir, 'data')
    working_dir = os.path.join(self.tempDir, 'working')
    test_file = os.path.join(working_dir, 'test.txt')
    mkdir_p(data_dir)
    mkdir_p(working_dir)

    options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore'))
    options.logLevel = self.dockerTestLogLevel
    options.workDir = working_dir
    options.clean = 'always'
    options.disableCaching = disableCaching

    # No base64 logic since it might create a name starting with a `-`.
    container_name = uuid.uuid4().hex
    A = Job.wrapJobFn(_testDockerCleanFn, working_dir, detached, rm, deferParam,
                      container_name)
    try:
        Job.Runner.startToil(A, options)
    except FailedJobsException:
        # The file created by spooky_container would remain in the directory
        # and since it was created inside the container, it would have had
        # uid and gid == 0 (root) which may cause problems when docker
        # attempts to clean up the jobstore.
        file_stats = os.stat(test_file)
        assert file_stats.st_gid != 0
        assert file_stats.st_uid != 0

        if (rm and (deferParam != FORGO)) or deferParam == RM:
            # These containers should not exist
            assert containerIsRunning(container_name) is None, \
                'Container was not removed.'
        elif deferParam == STOP:
            # These containers should exist but be non-running
            assert containerIsRunning(container_name) == False, \
                'Container was not stopped.'
        else:
            # These containers will be running
            assert containerIsRunning(container_name) == True, \
                'Container was not running.'

    # Clean up whatever is left so the next test starts fresh
    client = docker.from_env(version='auto')
    dockerKill(container_name, client)
    try:
        os.remove(test_file)
    except OSError:
        # Best effort: the file may never have been created or may already be gone. Only
        # filesystem errors are ignored, so KeyboardInterrupt/SystemExit still propagate.
        pass
def _populate_keys_from_metadata_server(self):
    """
    Populate this object's credential attributes from a file cache, falling back to the
    original ``_populate_keys_from_metadata_server_orig`` implementation when the cache at
    ``cache_path`` is missing or expired.

    The cache file holds four newline-separated fields (access key, secret key, security
    token, expiry time). An empty cache file marks credentials that are not temporary; in
    that case this function returns without touching the attributes.

    Concurrent processes coordinate through ``<cache_path>.tmp``: it is created with
    O_CREAT | O_EXCL, so only one process (the "winner") succeeds and refreshes the cache.
    The others poll until the temporary file disappears and then start over.

    Exceptions raised by the original implementation propagate after the temporary file has
    been removed. IOError/OSError other than ENOENT/EEXIST respectively are re-raised.
    """
    global _populate_keys_from_metadata_server_orig
    path = os.path.expanduser(cache_path)
    tmp_path = path + '.tmp'
    while True:
        log.debug('Attempting to read cached credentials from %s.', path)
        try:
            with open(path, 'r') as f:
                content = f.read()
                if content:
                    record = content.split('\n')
                    assert len(record) == 4
                    self._access_key = record[0]
                    self._secret_key = record[1]
                    self._security_token = record[2]
                    self._credential_expiry_time = str_to_datetime(record[3])
                else:
                    log.debug('%s is empty. Credentials are not temporary.', path)
                    return
        except IOError as e:
            if e.errno == errno.ENOENT:
                log.debug('Cached credentials are missing.')
                dir_path = os.path.dirname(path)
                if not os.path.exists(dir_path):
                    log.debug('Creating parent directory %s', dir_path)
                    # A race would be ok at this point
                    mkdir_p(dir_path)
            else:
                raise
        else:
            # The cache was read successfully; decide whether it is still usable
            if self._credentials_need_refresh():
                log.debug('Cached credentials are expired.')
            else:
                log.debug('Cached credentials exist and are still fresh.')
                return
        # We get here if credentials are missing or expired
        log.debug('Racing to create %s.', tmp_path)
        # Only one process, the winner, will succeed
        try:
            fd = os.open(tmp_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0600)
        except OSError as e:
            if e.errno == errno.EEXIST:
                log.debug(
                    'Lost the race to create %s. Waiting on winner to remove it.',
                    tmp_path)
                # Poll until the winner renames or unlinks the temporary file.
                # NOTE(review): there is no timeout here - presumably a winner that dies
                # without cleaning up would leave the losers waiting; confirm.
                while os.path.exists(tmp_path):
                    time.sleep(.1)
                log.debug('Winner removed %s. Trying from the top.', tmp_path)
            else:
                raise
        else:
            try:
                log.debug(
                    'Won the race to create %s. '
                    'Requesting credentials from metadata service.', tmp_path)
                _populate_keys_from_metadata_server_orig(self)
            except:
                # Close before unlinking; fd is reset so the finally clause below does not
                # close it a second time.
                os.close(fd)
                fd = None
                log.debug('Failed to obtain credentials, removing %s.', tmp_path)
                # This unblocks the losers.
                os.unlink(tmp_path)
                # Bail out. It's too likely to happen repeatedly
                raise
            else:
                if self._credential_expiry_time is None:
                    os.close(fd)
                    fd = None
                    log.debug(
                        'Credentials are not temporary. '
                        'Leaving %s empty and renaming it to %s.', tmp_path, path)
                else:
                    log.debug('Writing credentials to %s.', tmp_path)
                    with os.fdopen(fd, 'w') as fh:
                        # The file object now owns the descriptor and closes it on exit
                        fd = None
                        fh.write('\n'.join([
                            self._access_key, self._secret_key,
                            self._security_token,
                            datetime_to_str(self._credential_expiry_time)
                        ]))
                    log.debug('Wrote credentials to %s. '
                              'Renaming it to %s.', tmp_path, path)
                # Renaming publishes the cache and removes tmp_path, which releases the
                # waiting processes.
                os.rename(tmp_path, path)
                return
            finally:
                # Safety net for any path that left the descriptor open
                if fd is not None:
                    os.close(fd)
def testDockerClean(self, caching=True):
    """
    Exercise the `defer` clean-up mechanism for docker containers.

    For each tested combination a job starts a container that creates a file in the data
    directory and sleeps for 5 minutes. The calling job gets SIGKILLed after a minute, leaving
    the spooky/ghost/zombie container behind; the test then checks that the container ends up
    in the state the defer mechanism promises at batch system shutdown. This inherently also
    tests _docker.

    :returns: None
    """
    # Expected end state per combination (`rm` together with `detached` is not tested).
    # R = Running, X = Does not exist, E = Exists but not running.
    #              None     FORGO     STOP    RM
    #    rm        X         R         X      X
    # detached     R         R         E      X
    #  Neither     R         R         E      X
    assert os.getuid() != 0, "Cannot test this if the user is root."

    data_dir = os.path.join(self.tempDir, 'data')
    work_dir = os.path.join(self.tempDir, 'working')
    test_file = os.path.join(data_dir, 'test.txt')
    for directory in (data_dir, work_dir):
        mkdir_p(directory)

    options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore'))
    options.logLevel = 'INFO'
    options.workDir = work_dir
    options.clean = 'always'
    if not caching:
        options.disableCaching = True

    # Same order as three nested loops over rm, detached and defer
    scenarios = [(rm, detached, defer)
                 for rm in (True, False)
                 for detached in (True, False)
                 if not (detached and rm)
                 for defer in (FORGO, STOP, RM, None)]

    for rm, detached, defer in scenarios:
        # Not using base64 logic here since it might create a name starting with a `-`.
        container_name = uuid.uuid4().hex
        root_job = Job.wrapJobFn(_testDockerCleanFn, data_dir, detached, rm, defer,
                                 container_name)
        try:
            Job.Runner.startToil(root_job, options)
        except FailedJobsException:
            # The file was created inside the container, so it started out owned by
            # root (uid and gid == 0). If the defer mechanism worked, ownership is
            # non-zero by now.
            stats = os.stat(test_file)
            assert stats.st_gid != 0
            assert stats.st_uid != 0

            should_be_gone = (rm and defer != FORGO) or defer == RM
            if should_be_gone:
                # These containers should not exist
                assert _containerIsRunning(container_name) is None, \
                    'Container was not removed.'
            elif defer == STOP:
                # These containers should exist but be non-running
                assert _containerIsRunning(container_name) == False, \
                    'Container was not stopped.'
            else:
                # These containers will be running
                assert _containerIsRunning(container_name) == True, \
                    'Container was not running.'
        finally:
            # Prepare for the next test.
            _dockerKill(container_name, RM)
            os.remove(test_file)
def consolidate_output(job, config, kallisto_output, rsem_star_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    Members of each input tarball are re-rooted under ``<uuid>/<tool dir>/<basename>``.
    ``config.uuid`` is modified in place: it gains a ``SINGLE-END.`` prefix for unpaired
    samples and a ``FAIL.`` prefix when bamQC was run and its fail flag is set.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID kallisto_output: FileStoreID for Kallisto output
    :param tuple(FileID, FileID, FileID)|tuple(FileID, FileID, FileID, bool, FileID) rsem_star_output:
           FileStoreIDs for RSEM and STAR output, and a flag/FileID if run with bamQC
    :param FileID fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    if not config.paired:
        config.uuid = 'SINGLE-END.' + config.uuid

    # Retrieve output file paths to consolidate
    rsem_tar = hugo_tar = kallisto_tar = fastqc_tar = bamqc_tar = star_tar = None
    if rsem_star_output:
        if config.bamqc:
            rsem_id, hugo_id, star_id, fail_flag, bamqc_id = flatten(rsem_star_output)
            bamqc_tar = job.fileStore.readGlobalFile(
                bamqc_id, os.path.join(work_dir, 'bamqc.tar.gz'))
            if fail_flag:
                config.uuid = 'FAIL.' + config.uuid
        else:
            rsem_id, hugo_id, star_id = flatten(rsem_star_output)
        rsem_tar = job.fileStore.readGlobalFile(
            rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(
            hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
        star_tar = job.fileStore.readGlobalFile(
            star_id, os.path.join(work_dir, 'star.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(
            kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(
            fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))

    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')

    # Input tarballs in the order they are appended, each with the directory (below the
    # sample's uuid) that its members are moved to. Tarballs that were not produced are None.
    layout = [
        (rsem_tar, ('RSEM',)),
        (hugo_tar, ('RSEM', 'Hugo')),
        (kallisto_tar, ('Kallisto',)),
        (fastqc_tar, ('QC', 'fastQC')),
        (bamqc_tar, ('QC', 'bamQC')),
        (star_tar, ('QC', 'STAR')),
    ]

    # Consolidate separate tarballs into one as streams (avoids unnecessary untaring)
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar, sub_dirs in layout:
            if tar is None:
                continue
            prefix = os.path.join(config.uuid, *sub_dirs)
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    # Open the member before renaming it, as link resolution uses the
                    # member's original name.
                    f_in_file = f_in.extractfile(tarinfo)
                    tarinfo.name = os.path.join(prefix, os.path.basename(tarinfo.name))
                    # extractfile() returns None for members without data (directories,
                    # devices, ...). Wrapping None in closing() would raise AttributeError
                    # on exit, so such members are copied header-only.
                    if f_in_file is None:
                        f_out.addfile(tarinfo)
                    else:
                        with closing(f_in_file):
                            f_out.addfile(tarinfo, fileobj=f_in_file)

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(
            config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(
            config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)