def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, transform, upload.
    Previously, this was not monolithic; change came in due to #126/#134.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(bam_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        if inputs.native_adam_path is None:
            hdfs_dir = "/data/"
        else:
            hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT, hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name
        hdfs_snps = hdfs_dir + "/" + inputs.dbsnp.split('://')[-1].split('/')[-1]

        if not inputs.run_local:
            download_data(job, master_ip, inputs, inputs.dbsnp, inputs.sample, hdfs_snps, hdfs_bam)
        else:
            copy_files([inputs.sample, inputs.dbsnp], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        adam_snps = hdfs_dir + "/snps.var.adam"
        adam_convert(job, master_ip, inputs, hdfs_bam, hdfs_snps, adam_input, adam_snps, spark_on_toil)

        adam_output = hdfs_prefix + ".processed.bam"
        adam_transform(job, master_ip, inputs, adam_input, adam_snps, hdfs_dir, adam_output, spark_on_toil)

        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

        if not inputs.run_local:
            upload_data(job, master_ip, inputs, adam_output, out_file, spark_on_toil)
        else:
            local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir, sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
def test_copy_files(tmpdir):
    from toil_lib.files import copy_files
    work_dir = str(tmpdir)
    os.mkdir(os.path.join(work_dir, 'test'))
    fpath = os.path.join(work_dir, 'output_file')
    with open(fpath, 'wb') as fout:
        fout.write(os.urandom(1024))
    copy_files([fpath], os.path.join(work_dir, 'test'))
    assert os.path.exists(os.path.join(work_dir, 'test', 'output_file'))
def consolidate_output(job, config, mutect, pindel, muse):
    """
    Combine the contents of separate tarball outputs into one via streaming

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str mutect: MuTect tarball FileStoreID
    :param str pindel: Pindel tarball FileStoreID
    :param str muse: MuSE tarball FileStoreID
    """
    work_dir = job.fileStore.getLocalTempDir()
    mutect_tar, pindel_tar, muse_tar = None, None, None
    if mutect:
        mutect_tar = job.fileStore.readGlobalFile(mutect, os.path.join(work_dir, 'mutect.tar.gz'))
    if pindel:
        pindel_tar = job.fileStore.readGlobalFile(pindel, os.path.join(work_dir, 'pindel.tar.gz'))
    if muse:
        muse_tar = job.fileStore.readGlobalFile(muse, os.path.join(work_dir, 'muse.tar.gz'))
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [mutect_tar, pindel_tar, muse_tar] if x is not None]
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar is mutect_tar:
                            tarinfo.name = os.path.join(config.uuid, 'mutect', os.path.basename(tarinfo.name))
                        elif tar is pindel_tar:
                            tarinfo.name = os.path.join(config.uuid, 'pindel', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(job=job, fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)
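# Illustrative sketch (not taken from the pipelines above): the tar-streaming pattern that the
# consolidate_output jobs rely on, reduced to a standalone helper. Members of several input
# .tar.gz archives are copied into one output archive via extractfile()/addfile(), so nothing is
# ever unpacked to disk. The helper name and the optional rename callback are hypothetical.
import os
import tarfile
from contextlib import closing


def merge_tarballs(out_path, tar_paths, rename=None):
    """Stream members of each tarball in tar_paths into a single gzipped tarball at out_path."""
    with tarfile.open(out_path, 'w:gz') as f_out:
        for tar_path in tar_paths:
            with tarfile.open(tar_path, 'r') as f_in:
                for tarinfo in f_in:
                    if not tarinfo.isfile():
                        continue  # skip directories and special members
                    if rename is not None:
                        # e.g. prefix members with a per-sample directory, as the jobs above do
                        tarinfo.name = rename(tar_path, os.path.basename(tarinfo.name))
                    with closing(f_in.extractfile(tarinfo)) as member:
                        f_out.addfile(tarinfo, fileobj=member)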
def rsem_quantification(job, config, star_output):
    """
    Unpack STAR results and run RSEM (and save the BAM from STAR)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param tuple(FileID, FileID, FileID, FileID)|tuple(FileID, FileID, FileID) star_output: FileStoreIDs from STAR
    :return: FileStoreID results from RSEM postprocess and STAR log
    :rtype: tuple(FileID, FileID, FileID)
    """
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(16, config.cores)
    if config.wiggle:
        transcriptome_id, sorted_id, wiggle_id, log_id = flatten(star_output)
        wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
        job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
        if urlparse(config.output_dir).scheme == 's3':
            s3am_upload(fpath=wiggle_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        else:
            copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
    else:
        transcriptome_id, sorted_id, log_id = star_output
    # Save sorted bam if flag is selected
    if config.save_bam and not config.bamqc:  # if config.bamqc is selected, bam is being saved in run_bam_qc
        bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
        job.fileStore.readGlobalFile(sorted_id, bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[bam_path], output_dir=config.output_dir)
    # Declare RSEM and RSEM post-process jobs
    disk = 5 * transcriptome_id.size
    rsem_output = job.wrapJobFn(run_rsem, transcriptome_id, config.rsem_ref, paired=config.paired,
                                cores=cores, disk=disk)
    rsem_postprocess = job.wrapJobFn(run_rsem_postprocess, rsem_output.rv(0), rsem_output.rv(1))
    job.addChild(rsem_output)
    rsem_output.addChild(rsem_postprocess)
    # Save STAR log
    log_path = os.path.join(work_dir, 'Log.final.out')
    job.fileStore.readGlobalFile(log_id, log_path)
    tarball_files(tar_name='star.tar.gz', file_paths=[log_path], output_dir=work_dir)
    star_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'star.tar.gz'))
    return rsem_postprocess.rv(), star_id
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job:
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: boolean flag for QC failure and FileStoreID for output tar
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(aligned_bam_id, os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0', work_dir=work_dir, parameters=['runQC.sh', str(job.cores)])

    # Tar Output files
    output_names = ['readDist.txt',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.txt']
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    return fail_flag, job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bam_qc.tar.gz'))
def consolidate_output(job, config, kallisto_output, graphical_output):
    """
    Combines the contents of the outputs into one tarball and places it in the output directory or S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param str kallisto_output: FileStoreID for Kallisto output
    :param str graphical_output: FileStoreID for output of graphing step
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    graphical_tar, kallisto_tar = None, None
    # Retrieve output file paths to consolidate
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto_output.tar.gz'))
    if graphical_output:
        graphical_tar = job.fileStore.readGlobalFile(graphical_output, os.path.join(work_dir, 'single_cell_plots.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [graphical_tar, kallisto_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, os.path.basename(tarinfo.name))
                        elif tar == graphical_tar:
                            tarinfo.name = os.path.join(config.uuid, 'plots', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')], output_dir=config.output_dir)
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(job=job, fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir, s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
def output_file_job(job, filename, file_id, output_dir, s3_key_path=None):
    """
    Uploads a file from the FileStore to an output directory on the local filesystem or S3.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str filename: basename for file
    :param str file_id: FileStoreID
    :param str output_dir: Amazon S3 URL or local path
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    :return:
    """
    job.fileStore.logToMaster('Writing {} to {}'.format(filename, output_dir))
    work_dir = job.fileStore.getLocalTempDir()
    filepath = job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, filename))
    if urlparse(output_dir).scheme == 's3':
        s3am_upload(fpath=os.path.join(work_dir, filepath),
                    s3_dir=output_dir, s3_key_path=s3_key_path)
    elif os.path.exists(os.path.join(output_dir, filename)):
        job.fileStore.logToMaster("File already exists: {}".format(filename))
    else:
        mkdir_p(output_dir)
        copy_files([filepath], output_dir)
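# Hypothetical usage sketch (not one of the snippets above): how a job function such as
# output_file_job is typically wired into a Toil workflow. The jobstore path, file path,
# and output directory are illustrative assumptions, and `root` is a made-up parent job.
import os

from toil.common import Toil
from toil.job import Job


def root(job, path, output_dir):
    # Stage a local file into the FileStore, then hand its FileStoreID to output_file_job.
    file_id = job.fileStore.writeGlobalFile(path)
    job.addChildJobFn(output_file_job, os.path.basename(path), file_id, output_dir)


if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./example-jobstore')  # assumed local jobstore
    with Toil(options) as workflow:
        workflow.start(Job.wrapJobFn(root, '/tmp/example.txt', '/tmp/example-output'))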
def consolidate_output(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    out_tar = os.path.join(work_dir, '{}.tar.gz'.format(config.uuid))
    log(job, "{}".format(datetime.datetime.now()), uuid, 'consolidate_output')
    log(job, "consolidating {} files".format(len(chunk_infos)), uuid, 'consolidate_output')

    # build tarball
    out_tars = [out_tar]
    output_file_count = 0
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for ci in chunk_infos:
            file_id = ci[CI_OUTPUT_FILE_ID]
            tar_file = os.path.join(work_dir, "{}.tar.gz".format(ci[CI_CHUNK_INDEX]))
            job.fileStore.readGlobalFile(file_id, tar_file)
            out_tars.append(tar_file)
            with tarfile.open(tar_file, 'r') as f_in:
                for tarinfo in f_in:
                    if config.minimal_output and (
                            (tarinfo.name.endswith("bam") or
                             tarinfo.name.endswith("sam") or
                             tarinfo.name.endswith("bai")) and
                            ID_MERGED not in tarinfo.name):
                        log(job, "(Minimal Output) Skipping output file: {}".format(tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    if config.minimal_cpecan_output and tarinfo.name.endswith("gz"):
                        log(job, "(Minimal cPecan Output) Skipping output file: {}".format(tarinfo.name),
                            uuid, 'consolidate_output')
                        continue
                    log(job, "file {}".format(tarinfo.name), uuid, 'consolidate_output')
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        f_out.addfile(tarinfo, fileobj=f_in_file)
                        output_file_count += 1
    log(job, "Consolidated {} files in {} tarballs".format(output_file_count, len(out_tars)),
        uuid, 'consolidate_output')

    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        log(job, "Uploading {} to S3: {}".format(out_tar, config.output_dir), uuid, 'consolidate_output')
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.maxCores)
    else:
        log(job, "Moving {} to output dir: {}".format(out_tar, config.output_dir), uuid, 'consolidate_output')
        mkdir_p(config.output_dir)
        copy_files(file_paths=[out_tar], output_dir=config.output_dir)

    # log
    log_generic_job_debug(job, config.uuid, "consolidate_output", work_dir=work_dir)
    log_time(job, "consolidate_output", start, config.uuid)
    log(job, "{}".format(datetime.datetime.now()), uuid, 'END')

    # return location (calculated the same whether s3:// or file://)
    return os.path.join(config.output_dir, os.path.basename(out_tar))
def prepare_input(job, sample, config, enqueue_consolidation=True):
    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(job, "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}".format(
        url, contig_name, reference_url, params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    # config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL queries are handled so this function can be imported

    # ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir, ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    # params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
                                     os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(pysam.AlignmentFile(workdir_bam_location,
                                     'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(start_idx, end_idx), uuid, 'prepare_input')

    # get reads from positions
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid, 'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        # prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name, chunk_start, chunk_end)
        bam_split_command = ["view", "-b", data_bam_location, chunk_position_description]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        # write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job, config, work_dir, bam_split_command, DOCKER_SAMTOOLS_IMG,
                        DOCKER_SAMTOOLS_TAG, outfile=out)

        # document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir, chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(job, "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
            chunk_position_description, idx, chunk_size, int(chunk_size / 1024 / 1024), read_count),
            uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location], output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(min(int(chunk_size * MP_MEM_BAM_FACTOR + ref_genome_size * MP_MEM_REF_FACTOR),
                             config.maxMemory))
            mp_disk = int(min(int(chunk_size * MP_DSK_BAM_FACTOR + ref_genome_size * MP_DSK_REF_FACTOR +
                                  (0 if config.cpecan_probabilities else MP_DSK_CPECAN_FACTOR) * chunk_size),
                              config.maxDisk))
            log(job, "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem, int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase, config, chunk_fileid, ci,
                                                 memory=mp_mem, cores=mp_cores, disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output, config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
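# Illustrative sketch of the chunking arithmetic used in prepare_input above: the contig span
# [start_idx, end_idx) is cut into fixed-size partitions, and each partition's fetch window is
# padded by partition_margin on both sides so reads spanning a boundary land in both
# neighbouring chunks. Standalone and simplified; the function and key names are hypothetical.
def compute_chunks(start_idx, end_idx, partition_size, partition_margin):
    chunks = []
    idx = start_idx
    while idx < end_idx:
        boundary_start = idx                  # non-overlapping core start
        chunk_start = idx - partition_margin  # padded window start
        idx += partition_size
        boundary_end = idx                    # non-overlapping core end
        chunk_end = idx + partition_margin    # padded window end
        chunks.append({'boundary': (boundary_start, boundary_end),
                       'window': (chunk_start, chunk_end)})
    return chunks


# For example, compute_chunks(0, 250, 100, 10) yields cores (0,100), (100,200), (200,300)
# with fetch windows (-10,110), (90,210), (190,310).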
def run_margin_phase(job, config, chunk_file_id, chunk_info):
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier, 'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(chunk_name, chunk_file_id))

    # download references

    # ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid, genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError("Failed to download genome reference {} from {}".format(
            os.path.basename(config.reference_genome), config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name, genome_reference_name)

    # run marginPhase
    params = [os.path.join("/data", chunk_name),
              os.path.join("/data", genome_reference_name),
              os.path.join("/data", params_name),
              "-o", os.path.join("/data", "{}.out".format(chunk_identifier)),
              '--tag', "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                                         chunk_info[CI_CHUNK_BOUNDARY_END])]
    if cpecan_prob_location is not None:
        params.extend(['--singleNuclProbDir', os.path.join("/data", cpecan_prob_location)])
    docker_call(job, config, work_dir, params, config.margin_phase_image, config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), chunk_identifier,
                          'margin_phase', [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir, "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier, 'run_margin_phase')
    output_file_locations = glob.glob(os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier, 'run_margin_phase')
        if f.endswith(VCF_SUFFIX):
            found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX):
            found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.", chunk_identifier,
                'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))

        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier, 'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(job, "Found {} cpecan output tarballs: {}".format(len(cpecan_tarball), cpecan_tarball),
                chunk_identifier, 'run_margin_phase')
        else:
            log(job, "Saving cpecan output tarball: {}".format(cpecan_tarball[0]), chunk_identifier,
                'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name, file_paths=output_file_locations, output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
            if config.retry_attempts > MAX_RETRIES:
                log(job, "", chunk_identifier, 'run_margin_phase')
                error = "Failed to generate appropriate output files {} times".format(MAX_RETRIES)
                log(job, error, chunk_identifier, 'run_margin_phase')
                # this enables us to "recover" in the face of failure during a run
                if CONTINUE_AFTER_FAILURE:
                    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
                    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                    return chunk_info
                raise UserError("{}:{}".format(chunk_identifier, error))

        log(job, "Missing output files. Attempting retry {}".format(config.retry_attempts),
            chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as input:
            for line in input:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier, 'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(run_margin_phase, config, chunk_file_id, chunk_info,
                                      memory=str(int(config.maxMemory / 1024)) + "K",
                                      cores=job.cores, disk=job.disk)

        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name), os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
def consolidate_output(job, config, kallisto_output, rsem_star_output, fastqc_output):
    """
    Combines the contents of the outputs into one tarball and places it in the output directory or S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID kallisto_output: FileStoreID for Kallisto output
    :param tuple(FileID, FileID, FileID)|tuple(FileID, FileID, FileID, bool, FileID) rsem_star_output: FileStoreIDs for RSEM and STAR output, and a flag/FileID if run with bamQC
    :param FileID fastqc_output: FileStoreID for FastQC output
    """
    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    config.uuid = 'SINGLE-END.' + config.uuid if not config.paired else config.uuid
    # Retrieve output file paths to consolidate
    rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar = None, None, None, None, None, None
    if rsem_star_output:
        if config.bamqc:
            rsem_id, hugo_id, star_id, fail_flag, bamqc_id = flatten(rsem_star_output)
            bamqc_tar = job.fileStore.readGlobalFile(bamqc_id, os.path.join(work_dir, 'bamqc.tar.gz'))
            config.uuid = 'FAIL.' + config.uuid if fail_flag else config.uuid
        else:
            rsem_id, hugo_id, star_id = flatten(rsem_star_output)
        rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
        hugo_tar = job.fileStore.readGlobalFile(hugo_id, os.path.join(work_dir, 'rsem_hugo.tar.gz'))
        star_tar = job.fileStore.readGlobalFile(star_id, os.path.join(work_dir, 'star.tar.gz'))
    if kallisto_output:
        kallisto_tar = job.fileStore.readGlobalFile(kallisto_output, os.path.join(work_dir, 'kallisto.tar.gz'))
    if fastqc_output:
        fastqc_tar = job.fileStore.readGlobalFile(fastqc_output, os.path.join(work_dir, 'fastqc.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, config.uuid + '.tar.gz')
    # Consolidate separate tarballs into one as streams (avoids unnecessary untarring)
    tar_list = [x for x in [rsem_tar, hugo_tar, kallisto_tar, fastqc_tar, bamqc_tar, star_tar] if x is not None]
    with tarfile.open(out_tar, 'w:gz') as f_out:
        for tar in tar_list:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == rsem_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', os.path.basename(tarinfo.name))
                        elif tar == hugo_tar:
                            tarinfo.name = os.path.join(config.uuid, 'RSEM', 'Hugo', os.path.basename(tarinfo.name))
                        elif tar == kallisto_tar:
                            tarinfo.name = os.path.join(config.uuid, 'Kallisto', os.path.basename(tarinfo.name))
                        elif tar == bamqc_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'bamQC', os.path.basename(tarinfo.name))
                        elif tar == fastqc_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'fastQC', os.path.basename(tarinfo.name))
                        elif tar == star_tar:
                            tarinfo.name = os.path.join(config.uuid, 'QC', 'STAR', os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output location
    if urlparse(config.output_dir).scheme == 's3':
        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
    else:
        job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
        mkdir_p(config.output_dir)
        copy_files(file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')], output_dir=config.output_dir)
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, call, upload.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(bam_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT, hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        if not inputs.run_local:
            _log.info("Downloading input BAM %s to %s.", bam_name, hdfs_bam)
            call_conductor(job, master_ip, inputs.sample, hdfs_bam,
                           container='fnothaft/conductor',
                           memory=inputs.memory)
        else:
            copy_files([inputs.sample], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        _log.info("Converting input BAM to ADAM.")
        call_adam(job, master_ip,
                  ["transform", hdfs_bam, adam_input],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        avocado_output = hdfs_prefix + ".gt.adam"
        _log.info("Calling variants with avocado.")
        call_avocado(job, master_ip,
                     ["biallelicGenotyper", "-is_not_grc", adam_input, avocado_output],
                     memory=inputs.memory,
                     container='fnothaft/avocado')

        output_vcf = hdfs_prefix + ".vcf"
        _log.info("Converting output ADAM Genotypes to VCF.")
        call_adam(job, master_ip,
                  ["adam2vcf", avocado_output, output_vcf,
                   "-single",
                   "-sort_on_save",
                   "-stringency", "LENIENT"],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".vcf"

        if not inputs.run_local:
            _log.info("Uploading output VCF %s to %s.", output_vcf, out_file)
            call_conductor(job, master_ip, output_vcf, out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, output_vcf, spark_on_toil)
        else:
            local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir, sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, call, upload.
    """
    master_ip = MasterAddress(master_ip)

    fastq_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = ".".join(os.path.splitext(fastq_name)[:-1])

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT, hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_reads = hdfs_dir + "/" + fastq_name

        if not inputs.run_local:
            _log.info("Downloading input reads %s to %s.", fastq_name, hdfs_reads)
            call_conductor(job, master_ip, inputs.sample, hdfs_reads,
                           container='fnothaft/conductor',
                           memory=inputs.memory)

            index_exts = ['', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']
            hdfs_index = os.path.join(hdfs_prefix, 'reference.fa')
            for ext in index_exts:
                index_path = inputs.index + ext
                hdfs_index_ext = hdfs_index + ext
                _log.info("Downloading index file %s to %s.", index_path, hdfs_index_ext)
                call_conductor(job, master_ip, index_path, hdfs_index_ext,
                               container='fnothaft/conductor',
                               memory=inputs.memory)

            sd_path = inputs.index.replace('.fa', '.dict')
            hdfs_sd = hdfs_index.replace('.fa', '.dict')
            _log.info("Downloading sequence dictionary %s to %s.", sd_path, hdfs_sd)
            call_conductor(job, master_ip, sd_path, hdfs_sd,
                           container='fnothaft/conductor',
                           memory=inputs.memory)
        else:
            copy_files([inputs.sample], inputs.local_dir)

        aligned_output = hdfs_prefix + ".bam"
        _log.info("Aligning reads with Cannoli and BWA.")
        call_cannoli(job, master_ip,
                     ["bwa",
                      "-single",
                      hdfs_reads, aligned_output,
                      inputs.sample_id,
                      '-use_docker',
                      '-docker_image', 'fnothaft/bwa:debug-3',
                      '-index', hdfs_index,
                      '-add_indices',
                      '-sequence_dictionary', hdfs_sd],
                     memory=inputs.memory,
                     container='fnothaft/cannoli:1508-1509')

        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

        if not inputs.run_local:
            _log.info("Uploading output BAM %s to %s.", aligned_output, out_file)
            call_conductor(job, master_ip, aligned_output, out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, aligned_output, spark_on_toil)
        else:
            local_adam_output = "%s/%s.bam" % (inputs.local_dir, sample_name)
            move_files([local_adam_output], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise