def test_upload_and_download_with_encryption(tmpdir):
    from toil_scripts.lib.urls import s3am_upload
    from toil_scripts.lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32', 'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    s3_dir = 's3://cgl-driver-projects/test'
    s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
    # Download the file
    url = 'https://s3-us-west-2.amazonaws.com/cgl-driver-projects/test/upload_file'
    download_url(url=url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
    download_fpath = os.path.join(work_dir, 'download_file')
    assert os.path.exists(download_fpath)
    assert filecmp.cmp(upload_fpath, download_fpath)
    # Delete the Key
    conn = S3Connection()
    b = Bucket(conn, 'cgl-driver-projects')
    k = Key(b)
    k.key = 'test/upload_file'
    k.delete()
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (and save the sorted BAM from STAR if requested)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(str, str) star_output: FileStoreIDs from STAR's output
    :return: FileStoreID results from RSEM postprocess
    :rtype: str
    """
    cores = min(16, config.cores)
    disk = '2G' if config.ci_test else '40G'
    transcriptome_id, sorted_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam:
        work_dir = job.fileStore.getLocalTempDir()
        bam_path = os.path.join(work_dir, '{}.sorted.bam'.format(config.uuid))
        sorted_bam = job.fileStore.readGlobalFile(sorted_id, bam_path)
        if config.s3_output_dir and config.ssec:
            s3am_upload(fpath=sorted_bam, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
        if config.output_dir:
            move_files(file_paths=[sorted_bam], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    rsem_output = job.wrapJobFn(run_rsem, config.cores, transcriptome_id, config.rsem_ref,
                                paired=config.paired, cores=cores, disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, config.uuid, rsem_output.rv(0), rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    return rsem_postprocess.rv()
def test_upload_and_download_with_encryption(tmpdir):
    from toil_scripts.lib.urls import s3am_upload
    from toil_scripts.lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32', 'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        # Download the file
        download_url(url=s3_url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
        download_fpath = os.path.join(work_dir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
def upload_or_move(job, work_dir, output_dir, output, ssec=None):
    # are we moving this into a local dir, or up to s3?
    if output_dir.startswith('s3://'):
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=output_dir,
                    s3_key_path=ssec)
    else:
        # FIXME: undefined function
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)
def upload_or_move_hc(work_dir, output_dir, output, ssec=None):
    # are we moving this into a local dir, or up to s3?
    if output_dir.startswith('s3://'):
        # if ssec is None:
        #     raise ValueError('s3 output_dir provided, but ssec is missing')
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=output_dir,
                    s3_key_path=ssec)
    else:
        # FIXME: undefined function
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)
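# NOTE: Both upload_or_move variants above call make_directory and move_to_output_dir,
# which the FIXME comments flag as undefined. The following is a minimal, hypothetical
# sketch of what those helpers could look like; the names and behaviour are assumptions,
# not the project's actual implementation.
import errno
import os
import shutil


def make_directory(output_dir):
    """Create output_dir (and any missing parents), ignoring the case where it already exists."""
    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def move_to_output_dir(work_dir, output_dir, output):
    """Move the single file named `output` from work_dir into output_dir."""
    shutil.move(os.path.join(work_dir, output), os.path.join(output_dir, output))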
def run_bwa(job, inputs, ids):
    """
    Aligns two fastqs into a BAM file via BWA

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param Namespace inputs: Input arguments (see main)
    :param list ids: list of FileStore IDs (R1, R2, reference inputs)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_names = ['r1.fq.gz', 'r2.fq.gz', 'ref.fa', 'ref.fa.amb', 'ref.fa.ann',
                  'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa', 'ref.fa.fai']
    if inputs.alt:
        file_names.append('ref.fa.alt')
    for fileStoreID, name in zip(ids, file_names):
        job.fileStore.readGlobalFile(fileStoreID, os.path.join(work_dir, name))
    # Add read group line
    rg = "@RG\\tID:{0}\\tLB:{1}\\tPL:{2}\\tPU:{3}\\tSM:{0}".format(inputs.uuid, inputs.library,
                                                                   inputs.platform, inputs.program_unit)
    # BWA Options
    opt_args = []
    if not inputs.skip_sort:
        opt_args.append('-s')
    if inputs.trim:
        opt_args.append('-a')
    # Call: bwakit
    parameters = (['-t', str(inputs.cores), '-R', rg] +
                  opt_args +
                  ['-o', '/data/aligned', '/data/ref.fa', '/data/r1.fq.gz', '/data/r2.fq.gz'])
    outputs = {'aligned.aln.bam': inputs.mock_bam}
    docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)
    # bwakit insists on adding an `.aln.bam` suffix, so rename the output file
    output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
    os.rename(os.path.join(work_dir, 'aligned.aln.bam'), output_file)
    # Either write file to local output directory or upload to S3 cloud storage
    job.fileStore.logToMaster('Aligned sample: {}'.format(inputs.uuid))
    if inputs.output_dir:
        move_files([output_file], inputs.output_dir)
    if inputs.s3_dir:
        s3am_upload(output_file, inputs.s3_dir, s3_key_path=inputs.ssec)
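# For concreteness, the @RG string built in run_bwa above expands as follows. The sample
# values here are hypothetical; the doubled backslashes keep a literal "\t" in the string,
# which is the escape form bwa expects in its -R read-group argument.
uuid, library, platform, program_unit = 'sample1', 'lib1', 'ILLUMINA', 'unit1'
rg = "@RG\\tID:{0}\\tLB:{1}\\tPL:{2}\\tPU:{3}\\tSM:{0}".format(uuid, library, platform, program_unit)
assert rg == "@RG\\tID:sample1\\tLB:lib1\\tPL:ILLUMINA\\tPU:unit1\\tSM:sample1"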
def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places in output directory or s3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
    :param str fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
    if rsem_output:
        rsem_id, hugo_id = rsem_output
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    if not config.paired:
        config.uuid = 'SINGLE-END.{}'.format(config.uuid)
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', 'Hugo', os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, 'Kallisto', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'QC', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory
    if config.output_dir:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
    # Upload to S3
    if config.s3_output_dir:
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)
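# The stream-based consolidation pattern used above (and in the variant-calling
# consolidate_output further below) can be shown in isolation. This is a minimal,
# self-contained sketch -- merge_tarballs is a hypothetical helper, not part of
# toil_scripts -- that merges several .tar.gz archives into one without extracting
# anything to disk.
import os
import tarfile
from contextlib import closing


def merge_tarballs(tar_paths, out_path, prefix):
    """Merge .tar.gz archives into out_path, re-rooting each member under prefix."""
    with tarfile.open(out_path, 'w:gz') as f_out:
        for tar_path in tar_paths:
            with tarfile.open(tar_path, 'r') as f_in:
                for tarinfo in f_in:
                    if not tarinfo.isfile():
                        continue  # skip directory entries; extractfile() returns None for them
                    with closing(f_in.extractfile(tarinfo)) as member:
                        tarinfo.name = os.path.join(prefix, os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=member)

# Example usage (paths hypothetical):
# merge_tarballs(['rsem.tar.gz', 'kallisto.tar.gz'], 'sample.tar.gz', 'sample-uuid')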
def upload_or_move(job, work_dir, input_args, output):
    # are we moving this into a local dir, or up to s3?
    if input_args['output_dir']:
        # get output path and create the directory if needed
        output_dir = input_args['output_dir']
        # FIXME: undefined function
        make_directory(output_dir)
        move_to_output_dir(work_dir, output_dir, output)
    elif input_args['s3_dir']:
        s3am_upload(fpath=os.path.join(work_dir, output),
                    s3_dir=input_args['s3_dir'],
                    s3_key_path=input_args['ssec'])
    else:
        raise ValueError('No output_directory or s3_dir defined. Cannot determine where to store %s' % output)
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSE tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(config.uuid, 'mutect', os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(config.uuid, 'pindel', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)