def test_upload_and_download_with_encryption(tmpdir):
    from toil_scripts.lib.urls import s3am_upload
    from toil_scripts.lib.urls import download_url
    from boto.s3.connection import S3Connection, Bucket, Key
    work_dir = str(tmpdir)
    # Create temporary encryption key
    key_path = os.path.join(work_dir, 'foo.key')
    subprocess.check_call(['dd', 'if=/dev/urandom', 'bs=1', 'count=32',
                           'of={}'.format(key_path)])
    # Create test file
    upload_fpath = os.path.join(work_dir, 'upload_file')
    with open(upload_fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    # Upload file
    random_key = os.path.join('test/', str(uuid4()), 'upload_file')
    s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
    try:
        s3_dir = os.path.split(s3_url)[0]
        s3am_upload(fpath=upload_fpath, s3_dir=s3_dir, s3_key_path=key_path)
        # Download the file
        download_url(url=s3_url, name='download_file', work_dir=work_dir, s3_key_path=key_path)
        download_fpath = os.path.join(work_dir, 'download_file')
        assert os.path.exists(download_fpath)
        assert filecmp.cmp(upload_fpath, download_fpath)
    finally:
        # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
        with closing(S3Connection()) as conn:
            b = Bucket(conn, 'cgl-driver-projects')
            k = Key(b)
            k.key = random_key
            k.delete()
def run_kallisto(job, cores, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run Kallisto with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID of Kallisto output tarball
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    # Retrieve files and build parameters for single or paired-end data
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(cores),
                  '-o', '/data/',
                  '-b', '100']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])
    # Call: Kallisto
    docker_call(tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir, parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [os.path.join(work_dir, x) for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto.tar.gz'))
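# Illustrative usage sketch (not part of the pipeline): invoking run_kallisto from a
# parent Toil job. The fastq URLs, index URL, and resource requests below are
# hypothetical placeholders, not values shipped with this module.
def example_kallisto_parent(job, r1_url, r2_url, kallisto_index_url):
    work_dir = job.fileStore.getLocalTempDir()
    # Stage the fastqs locally, then register them with the file store
    download_url(url=r1_url, name='R1.fastq', work_dir=work_dir)
    download_url(url=r2_url, name='R2.fastq', work_dir=work_dir)
    r1_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
    r2_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fastq'))
    # The returned promise resolves to the FileStoreID of the kallisto output tarball
    return job.addChildJobFn(run_kallisto, 4, r1_id, r2_id, kallisto_index_url,
                             cores=4, disk='20G').rv()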
def run_star(job, cores, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run STAR with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: URL of STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs of transcriptome bam and sorted bam (plus wiggle file if wiggle=True)
    :rtype: tuple(str, str) or tuple(str, str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - STAR index files are either in a subdirectory or at the root of the tarball
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
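# Illustrative usage sketch (not part of the pipeline): chaining run_star into run_rsem.
# The resource requests are hypothetical placeholders; .rv(0) indexes into run_star's
# returned tuple of (transcriptome bam, sorted bam).
def example_star_then_rsem(job, r1_id, r2_id, star_index_url, rsem_ref_url, cores=16):
    star_job = job.addChildJobFn(run_star, cores, r1_id, r2_id, star_index_url,
                                 cores=cores, disk='100G')
    # run_rsem quantifies the transcriptome-space bam produced by STAR; making it a
    # child of star_job ensures it only runs after alignment completes
    rsem_job = star_job.addChildJobFn(run_rsem, cores, star_job.rv(0), rsem_ref_url,
                                      paired=True, cores=cores, disk='50G')
    return rsem_job.rv()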
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call: SplAdder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        # Fall back to searching the work_dir tree for the genes graph pickle
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call: Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def run_rsem(job, cores, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param int cores: Number of cores to run RSEM with
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'rsem_ref.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and RSEM reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [os.path.basename(os.path.splitext(x)[0]) for x in rsem_files if 'grp' in x][0]
    ref_folder = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
                parameters=parameters, work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
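# Illustrative usage sketch (not part of the pipeline): a follow-on job that copies
# RSEM's two output tables to a local directory. The output_dir default is a
# hypothetical placeholder.
def example_export_rsem(job, gene_id, isoform_id, output_dir='/tmp/rsem_output'):
    import shutil
    work_dir = job.fileStore.getLocalTempDir()
    # readGlobalFile returns the local path of the copied file
    gene_tab = job.fileStore.readGlobalFile(gene_id, os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_tab = job.fileStore.readGlobalFile(isoform_id, os.path.join(work_dir, 'rsem_isoform.tab'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for tab in (gene_tab, isoform_tab):
        shutil.copy(tab, output_dir)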
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'),
                  (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'),
                  (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)
    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR,SP',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def test_download_url(tmpdir):
    from toil_scripts.lib.urls import download_url
    work_dir = str(tmpdir)
    download_url(work_dir=work_dir, url='www.google.com', name='testy')
    assert os.path.exists(os.path.join(work_dir, 'testy'))
def docker_call(tool,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                sudo=False,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param bool sudo: If True, prepends `sudo` to the docker call
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the output files as keys, with either None
           or a URL as the value. The value is only used if mock=True
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool mock: Whether to run in mock mode. If None, the value is determined by
           mock_mode(), which consults the environment.
    """
    from toil_scripts.lib.urls import download_url
    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert(os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # Create mock file
                if not os.path.exists(file_path):
                    f = open(file_path, 'w')
                    f.write("contents")  # FIXME
                    f.close()
            else:
                if not os.path.exists(file_path):
                    download_url(url, work_dir=work_dir, name=filename)
                assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.iteritems():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters
    if sudo:
        base_docker_call = ['sudo'] + base_docker_call

    _log.debug("Calling docker with %s." % " ".join(base_docker_call + [tool] + parameters))

    docker_call = base_docker_call + [tool] + parameters

    try:
        if outfile:
            subprocess.check_call(docker_call, stdout=outfile)
        else:
            if check_output:
                return subprocess.check_output(docker_call)
            else:
                subprocess.check_call(docker_call)
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, work_dir)
    else:
        # Fix root ownership of output files
        _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert(os.path.isfile(filename))
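# Illustrative usage sketch: indexing a bam with samtools through docker_call.
# Assumes alignment.bam already exists in work_dir; the directory is supplied by
# the caller and is hypothetical here.
def example_docker_call_usage(work_dir):
    # inputs asserts alignment.bam is present before the call; outputs asserts
    # alignment.bam.bai exists afterwards (or mocks it when mock mode is on)
    docker_call(tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c',
                work_dir=work_dir,
                parameters=['index', '/data/alignment.bam'],
                inputs=['alignment.bam'],
                outputs={'alignment.bam.bai': None})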