def test_upload_and_download_with_encryption(tmpdir):
    from toil_lib.urls import s3am_upload
    from toil_lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32',
                           'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        # Download the file
        download_url(url=s3_url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
        download_fpath = os.path.join(work_dir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
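
# A minimal alternative sketch (not from the original test): the 32-byte SSE-C key that the
# test builds by shelling out to `dd` could also be written directly from Python.
# `write_ssec_key` is a hypothetical helper name.
def write_ssec_key(key_path, size=32):
    # s3am expects a 32-byte key for SSE-C encryption, matching the dd invocation above.
    with open(key_path, 'wb') as f:
        f.write(os.urandom(size))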
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSe tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(config.uuid, 'mutect', os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(config.uuid, 'pindel', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(job=job, fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
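
# A wiring sketch under assumed names (run_mutect, run_pindel, and run_muse are hypothetical
# upstream job functions, not part of this file): consolidate_output is typically scheduled
# as a follow-on so that the FileStoreIDs promised via .rv() are resolved before it runs.
def example_wiring(job, config):
    mutect_job = job.addChildJobFn(run_mutect, config)
    pindel_job = job.addChildJobFn(run_pindel, config)
    muse_job = job.addChildJobFn(run_muse, config)
    # Follow-ons of this job run only after all of its children have finished.
    job.addFollowOnJobFn(consolidate_output, config,
                         mutect_job.rv(), pindel_job.rv(), muse_job.rv())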
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (and save the BAM from STAR)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(FileID, FileID, FileID, FileID)|tuple(FileID, FileID, FileID) star_output: FileStoreIDs from STAR
    :return: FileStoreID results from RSEM postprocess and STAR log
    :rtype: tuple(FileID, FileID, FileID)
    """
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(16, config.cores)
    if config.wiggle:
        transcriptome_id, sorted_id, wiggle_id, log_id = flatten(star_output)
        wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
        job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
        if urlparse(config.output_dir).scheme == 's3':
            s3am_upload(fpath=wiggle_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        else:
            copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
    else:
        transcriptome_id, sorted_id, log_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam and not config.bamqc:  # if config.bamqc is selected, bam is being saved in run_bam_qc
        bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
        job.fileStore.readGlobalFile(sorted_id, bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[bam_path], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    disk = 5 * transcriptome_id.size
    rsem_output = job.wrapJobFn(run_rsem, transcriptome_id, config.rsem_ref, paired=config.paired,
                                cores=cores, disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, rsem_output.rv(0), rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    # Save STAR log
    log_path = os.path.join(work_dir, 'Log.final.out')
    job.fileStore.readGlobalFile(log_id, log_path)
    tarball_files(tar_name='star.tar.gz', file_paths=[log_path], output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'star.tar.gz'))
    return rsem_postprocess.rv(), star_id
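
# Note on the nested return value (illustration only; names taken from the consolidation step
# further below): run_rsem_postprocess is assumed to resolve to (rsem_id, hugo_id), so the
# value returned here is consumed downstream roughly as
#   flatten(((rsem_id, hugo_id), star_id))  ->  rsem_id, hugo_id, star_id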
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: boolean fail flag and FileStoreID for the bam_qc output tar
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(aligned_bam_id, os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0', work_dir=work_dir, parameters=['runQC.sh', str(job.cores)])
    # Tar output files
    output_names = ['readDist.txt',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.txt']
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=work_dir)
    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)
    return fail_flag, job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bam_qc.tar.gz'))
def consolidate_output(job, config, kallisto_output, graphical_output):
    """
    Combine the contents of the outputs into one tarball and place it in the output directory or on S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param str graphical_output: FileStoreID for output of graphing step
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    graphical_tar, kallisto_tar = None, None
    # Retrieve output file paths to consolidate
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto_output.tar.gz'))
    if graphical_output:
        graphical_tar = job.fileStore.readGlobalFile(graphical_output, os.path.join(work_dir, 'single_cell_plots.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [graphical_tar, kallisto_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, os.path.basename(tarinfo.name))
                        elif tar == graphical_tar:
                            tarinfo.name = os.path.join(config.uuid, 'plots', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
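
# The streaming pattern used in the consolidation functions above, reduced to a standalone
# sketch (merge_tarballs is an assumed helper name, not part of the original code): members
# are copied tar-to-tar via addfile(fileobj=...), so nothing is extracted to disk. Unlike the
# functions above, it also guards against non-regular members, for which extractfile() returns None.
def merge_tarballs(out_path, in_paths, prefix=''):
    with tarfile.open(out_path, 'w:gz') as f_out:
        for in_path in in_paths:
            with tarfile.open(in_path, 'r') as f_in:
                for tarinfo in f_in:
                    tarinfo.name = os.path.join(prefix, os.path.basename(tarinfo.name))
                    if tarinfo.isreg():
                        # Regular file: stream its bytes straight into the output archive
                        with closing(f_in.extractfile(tarinfo)) as fileobj:
                            f_out.addfile(tarinfo, fileobj=fileobj)
                    else:
                        # Directories, symlinks, etc. carry no data payload
                        f_out.addfile(tarinfo)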
def process_bam_and_upload(job, bam_id, gdc_id, disk='80G'):
    """
    Convert a BAM to paired FASTQs with samtools, gzip them, tar them, and upload the tarball to S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_id: FileStoreID of the input BAM
    :param str gdc_id: Identifier used to name the output tarball
    :param str disk: Disk requirement for the job
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'input.bam'))
    # Split the BAM into R1/R2 FASTQs inside the samtools container
    parameters = ['fastq', '-1', '/data/R1.fastq', '-2', '/data/R2.fastq', '/data/input.bam']
    docker_call(tool='quay.io/ucsc_cgl/samtools', work_dir=work_dir, parameters=parameters)
    subprocess.check_call(['gzip', os.path.join(work_dir, 'R1.fastq')])
    subprocess.check_call(['gzip', os.path.join(work_dir, 'R2.fastq')])
    # Tar the gzipped FASTQs and upload to S3
    out_tar = os.path.join(work_dir, gdc_id + '.tar.gz')
    with tarfile.open(out_tar, 'w:gz') as tar:
        for name in [os.path.join(work_dir, x) for x in ['R1.fastq.gz', 'R2.fastq.gz']]:
            tar.add(name, arcname=os.path.basename(name))
    s3am_upload(out_tar, s3_dir='s3://cgl-ccle-data/')
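
# A launch sketch under assumptions (the BAM path, GDC ID, and job-store location are
# placeholders; the real driver script may differ): the BAM is imported into the job store
# and process_bam_and_upload is wrapped as the root job.
def example_launch():
    from toil.common import Toil
    from toil.job import Job
    options = Job.Runner.getDefaultOptions('./example-jobstore')  # placeholder job store
    with Toil(options) as toil:
        bam_id = toil.importFile('file:///data/example.bam')  # placeholder path
        toil.start(Job.wrapJobFn(process_bam_and_upload, bam_id, 'EXAMPLE-GDC-ID'))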
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(job=job, fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir, s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
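
# Typical call-site sketch (names and the disk value are placeholders, not from the original):
#   job.addChildJobFn(output_file_job, 'sample.vcf', vcf_id, config.output_dir,
#                     s3_key_path=config.ssec, disk='2G')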
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir, s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
def consolidate_output(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    out_tar = os.path.join(work_dir, '{}.tar.gz'.format(config.uuid))
    log(job, "{}".format(datetime.datetime.now()), uuid, 'consolidate_output')
    log(job, "consolidating {} files".format(len(chunk_infos)), uuid, 'consolidate_output')

    # build tarball
    out_tars = [out_tar]
    output_file_count = 0
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for ci in chunk_infos:
            file_id = ci[CI_OUTPUT_FILE_ID]
            tar_file = os.path.join(work_dir, "{}.tar.gz".format(ci[CI_CHUNK_INDEX]))
            job.fileStore.readGlobalFile(file_id, tar_file)
            out_tars.append(tar_file)
            with tarfile.open(tar_file, 'r') as f_in:
                for tarinfo in f_in:
                    if config.minimal_output and (
                            (tarinfo.name.endswith("bam") or tarinfo.name.endswith("sam")
                             or tarinfo.name.endswith("bai"))
                            and ID_MERGED not in tarinfo.name):
                        log(job, "(Minimal Output) Skipping output file: {}".format(tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    if config.minimal_cpecan_output and tarinfo.name.endswith("gz"):
                        log(job, "(Minimal cPecan Output) Skipping output file: {}".format(tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    log(job, "file {}".format(tarinfo.name), uuid, 'consolidate_output')
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        f_out.addfile(tarinfo, fileobj=f_in_file)
                        output_file_count += 1
    log(job, "Consolidated {} files in {} tarballs".format(output_file_count, len(out_tars)),
        uuid, 'consolidate_output')

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        log(job, "Uploading {} to S3: {}".format(out_tar, config.output_dir), uuid, 'consolidate_output')
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.maxCores)
    else:
        log(job, "Moving {} to output dir: {}".format(out_tar, config.output_dir), uuid, 'consolidate_output')
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)

    # log
    log_generic_job_debug(job, config.uuid, "consolidate_output", work_dir=work_dir)
    log_time(job, "consolidate_output", start, config.uuid)
    log(job, "{}".format(datetime.datetime.now()), uuid, 'END')

    # return location (calculated the same whether s3:// or file://)
    return os.path.join(config.output_dir, os.path.basename(out_tar))
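
# A small refactoring sketch (assumption, not in the original): the two skip checks inside the
# tar loop above can be expressed as one predicate, equivalent to the inline conditions on
# config.minimal_output and config.minimal_cpecan_output.
def _should_skip_member(tarinfo, config):
    name = tarinfo.name
    if config.minimal_output and ID_MERGED not in name and name.endswith(('bam', 'sam', 'bai')):
        return True  # alignment artifacts are dropped unless they belong to the merged output
    if config.minimal_cpecan_output and name.endswith('gz'):
        return True  # compressed cPecan intermediates are dropped in minimal mode
    return False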
def consolidate_output(job, config, kallisto_output, rsem_star_output, fastqc_output):
    """
    Combine the contents of the outputs into one tarball and place it in the output directory or on S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID kallisto_output: FileStoreID for Kallisto output
    :param tuple(FileID, FileID, FileID)|tuple(FileID, FileID, FileID, bool, FileID) rsem_star_output:
        FileStoreIDs for RSEM and STAR output, and a flag/FileID if run with bamQC
    :param FileID fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    config.uuid = 'SINGLE-END.' + config.uuid if not config.paired else config.uuid
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar = None, None, None, None, None, None
    if rsem_star_output:
        if config.bamqc:
            rsem_id, hugo_id, star_id, fail_flag, bamqc_id = flatten(rsem_star_output)
            bamqc_tar = job.fileStore.readGlobalFile(bamqc_id, os.path.join(work_dir, 'bamqc.tar.gz'))
            config.uuid = 'FAIL.' + config.uuid if fail_flag else config.uuid
        else:
            rsem_id, hugo_id, star_id = flatten(rsem_star_output)
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
        star_tar = job.fileStore.readGlobalFile(star_id, os.path.join(work_dir, 'star.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', 'Hugo', os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, 'Kallisto', os.path.basename(tarinfo.name))
                        elif tar == bamqc_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'bamQC', os.path.basename(tarinfo.name))
                        elif tar == fastqc_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'fastQC', os.path.basename(tarinfo.name))
                        elif tar == star_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'STAR', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)