def umi_transform(data):
    """Put the barcode and UMI of each read into the read name.

    Runs ``umis fastqtransform`` on the first input fastq, pipes the result
    through ``seqtk seq -L 20`` and gzip, and writes it to
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    The sample's UMI type is either a path to a transform file or a name
    resolved through ``get_transform_file``. If neither yields an existing
    file, an error is logged and the process exits with status 1.

    Returns ``data``. ``data["files"]`` is replaced by a one-element list
    holding either the transformed fastq or, when the input already looks
    transformed, the original first fastq. When no transform is configured
    and the input does not look pre-transformed, ``data`` is returned as is.
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        logger.error("No UMI transform was specified, but %s does not look "
                     "pre-transformed. Assuming non-umi data." % fq1)
        return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (transform, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)

    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # Reuse the output of a previous run.
    if file_exists(out_file):
        data["files"] = [out_file]
        return data

    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)

    # Skip the transformation if the first record already carries a UMI tag.
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            # Pass the input on unchanged: out_file failed the file_exists
            # check above, so pointing at it would hand a missing file on.
            data["files"] = [fq1]
            return data

    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
def umi_transform(data):
    """Put the barcode and UMI of each read into the read name.

    Up to four input fastq files are passed to ``umis fastqtransform``; the
    result is piped through ``seqtk seq -L 20`` and gzip into
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    The sample's UMI type is either a path to a transform file or a name
    resolved through ``get_transform_file``. If neither yields an existing
    file, an error is logged and the process exits with status 1.

    Returns ``[[data]]`` with ``data["files"]`` replaced by a one-element
    list: the transformed fastq, or the original first fastq when it already
    looks transformed.
    """
    # Pad a copy to four entries so the caller's list is left untouched.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (transform, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)

    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # Reuse the output of a previous run.
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    # Materialize the barcodes so len() also works on a lazy iterable.
    cellular_barcodes = list(get_cellular_barcodes(data))
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)

    # Skip the transformation if the first record already carries a UMI tag.
    with open_fastq(fq1) as in_handle:
        # The next() builtin works on both Python 2 and 3 iterators.
        read = next(in_handle)
        if "UMI_" in read:
            # Pass the input on unchanged: out_file failed the file_exists
            # check above, so pointing at it would hand a missing file on.
            data["files"] = [fq1]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """Put the barcode and UMI of each read into the read name.

    Up to four input fastq files are passed to ``umis fastqtransform``; the
    result is piped through ``seqtk seq -L 20`` and gzip into
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    The sample's UMI type is either a path to a transform file or a name
    resolved through ``get_transform_file``. If neither yields an existing
    file, an error is logged and the process exits with status 1.

    Returns ``[[data]]`` with ``data["files"]`` replaced by a one-element
    list: the transformed fastq, or the original first fastq when it already
    looks transformed.
    """
    # Pad a copy to four entries so the caller's list is left untouched.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (transform, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)

    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # Reuse the output of a previous run.
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    # Materialize the barcodes so len() also works on a lazy iterable.
    cellular_barcodes = list(get_cellular_barcodes(data))
    split_option = "--separate_cb" if len(cellular_barcodes) > 1 else ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)

    # Skip the transformation if the first record already carries a UMI tag.
    with open_fastq(fq1) as in_handle:
        # The next() builtin works on both Python 2 and 3 iterators.
        read = next(in_handle)
        if "UMI_" in read:
            # Pass the input on unchanged: out_file failed the file_exists
            # check above, so pointing at it would hand a missing file on.
            data["files"] = [fq1]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s"
        % fq1)
    with file_transaction(out_file) as tx_out_file:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """Rewrite each read so its barcode and UMI live in the read name.

    The transform description for the sample's UMI type is looked up in
    ``transforms``, written as JSON under ``<work_dir>/umis`` and handed to
    ``umis fastqtransform``. Output goes through ``seqtk seq -L 20`` and gzip
    into ``<sample>.umitransformed.fq.gz`` in the same directory.

    Returns ``[[data]]`` with ``data["files"]`` set to the transformed fastq.
    An existing output file is reused without running the command.
    """
    read1, read2 = dd.get_input_sequence_files(data)
    read2 = read2 or ""
    work_umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    umi_type = dd.get_umi_type(data)
    spec = transforms[umi_type]
    safe_makedir(work_umi_dir)
    spec_json = write_transform_file(
        spec, os.path.join(work_umi_dir, umi_type + ".json"))
    transformed_fq = os.path.join(
        work_umi_dir, dd.get_sample_name(data) + ".umitransformed.fq.gz")

    if not file_exists(transformed_fq):
        dual_flag = "--dual_index" if spec["dual"] else ""
        two_barcodes = len(dd.get_cellular_barcodes(data)) == 2
        split_flag = "--separate_cb" if two_barcodes else ""
        umis_exe = config_utils.get_program("umis", data, default="umis")
        template = (
            "{umis} fastqtransform {index_option} {split_option} {transform_file} "
            "{fq1} {fq2} "
            "| seqtk seq -L 20 - | gzip > {tx_out_file}")
        note = (
            "Inserting UMI and barcode information into the read name of %s"
            % read1)
        with file_transaction(transformed_fq) as tx_out_file:
            do.run(
                template.format(
                    umis=umis_exe,
                    index_option=dual_flag,
                    split_option=split_flag,
                    transform_file=spec_json,
                    fq1=read1,
                    fq2=read2,
                    tx_out_file=tx_out_file),
                note)

    data["files"] = [transformed_fq]
    return [[data]]
def umi_transform(data):
    """Tag every read name with the read's barcode and UMI.

    Looks up the sample's UMI type in ``transforms``, writes that description
    to ``<work_dir>/umis/<umi_type>.json`` and runs ``umis fastqtransform``
    with it. The command output is piped through ``seqtk seq -L 20`` and gzip
    into ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    Returns ``[[data]]`` whose ``data["files"]`` holds only the transformed
    fastq; if that file already exists nothing is re-run.
    """
    first_fq, second_fq = dd.get_input_sequence_files(data)
    if not second_fq:
        second_fq = ""
    out_dir = os.path.join(dd.get_work_dir(data), "umis")
    kind = dd.get_umi_type(data)
    description = transforms[kind]
    safe_makedir(out_dir)
    json_path = os.path.join(out_dir, kind + ".json")
    json_path = write_transform_file(description, json_path)
    result_fq = os.path.join(
        out_dir, dd.get_sample_name(data) + ".umitransformed.fq.gz")

    # An earlier run already produced the output.
    if file_exists(result_fq):
        data["files"] = [result_fq]
        return [[data]]

    fields = {
        "fq1": first_fq,
        "fq2": second_fq,
        "transform_file": json_path,
    }
    fields["index_option"] = "--dual_index" if description["dual"] else ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        fields["split_option"] = "--separate_cb"
    else:
        fields["split_option"] = ""
    fields["umis"] = config_utils.get_program("umis", data, default="umis")

    pipeline = ("{umis} fastqtransform {index_option} {split_option} {transform_file} "
                "{fq1} {fq2} "
                "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    log_text = ("Inserting UMI and barcode information into the read name of %s"
                % first_fq)
    with file_transaction(result_fq) as tx_out_file:
        do.run(pipeline.format(tx_out_file=tx_out_file, **fields), log_text)
    data["files"] = [result_fq]
    return [[data]]
def _run_somatic(paired, ref_file, target, out_file):
    """Call somatic variants with octopus for a tumor/normal pair or tumor alone.

    Settings for low frequency, tumor-only and UMI inputs follow:
    https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config

    Returns ``out_file`` once octopus has run and its output has been passed
    to ``_produce_compatible_vcf``.
    """
    tumor = paired.tumor_data

    reads_arg = paired.tumor_bam
    if paired.normal_bam:
        reads_arg = reads_arg + (" %s --normal-sample %s"
                                 % (paired.normal_bam, paired.normal_name))
    threads = dd.get_num_cores(tumor)

    # Frequencies below 0.4% are not searched because of long runtimes:
    # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979
    requested_af = float(dd.get_min_allele_fraction(tumor)) / 100.0
    expected_af = max(requested_af, 0.004)
    credible_af = expected_af / 4.0

    pieces = [
        "octopus --threads {cores} --reference {ref_file} --reads {align_bams} "
        "--regions-file {target} "
        "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af} "
        "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20 "
        "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05' "
        "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3 "
        "--caller cancer "
        "--working-directory {tmp_dir} "
        "-o {tx_out_file} --legacy"
    ]
    if not paired.normal_bam:
        # Tumor-only run.
        pieces.append(" --tumour-germline-concentration 5")
    if dd.get_umi_type(tumor) or _is_umi_consensus_bam(paired.tumor_bam):
        # UMI input.
        pieces.append(
            " --allow-octopus-duplicates --overlap-masking 0 "
            "--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 "
            "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'")
    template = "".join(pieces)

    with file_transaction(tumor, out_file) as tx_out_file:
        command = template.format(
            cores=threads,
            ref_file=ref_file,
            align_bams=reads_arg,
            target=target,
            min_af_floor=credible_af,
            min_af=expected_af,
            tmp_dir=os.path.dirname(tx_out_file),
            tx_out_file=tx_out_file)
        do.run(command, "Octopus somatic calling")
        _produce_compatible_vcf(tx_out_file, tumor, is_somatic=True)
    return out_file
def get_cellular_barcodes(data):
    """Return the cellular barcode files to use for a sample, as a list.

    Barcodes configured on the sample take precedence and are returned as
    given. Otherwise, for a supported transform, the ``<umi_type>-cb1.txt``
    and ``<umi_type>-cb2.txt`` files under ``TRANSFORM_DIR`` that exist are
    returned. In every other case the result is an empty list.
    """
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + suffix)
                  for suffix in ("-cb1.txt", "-cb2.txt")]
    # Build a real list: filter() is lazy on Python 3 and callers need len().
    return [fname for fname in candidates if file_exists(fname)]
def get_cellular_barcodes(data):
    """Return the cellular barcode files to use for a sample, as a list.

    Barcodes configured on the sample take precedence and are returned as
    given. Otherwise, for a supported transform, whichever of the
    ``<umi_type>-cb1.txt``, ``-cb2.txt`` and ``-cb3.txt`` files under
    ``TRANSFORM_DIR`` exist are returned. In every other case the result is
    an empty list.
    """
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + suffix)
                  for suffix in ("-cb1.txt", "-cb2.txt", "-cb3.txt")]
    # Build a real list: filter() is lazy on Python 3 and callers need len().
    return [fname for fname in candidates if file_exists(fname)]
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a
    single machine. Prepares a bgzip and grabix indexed file for retrieving
    sections of files.

    Returns a list of single-item lists, each wrapping one data dictionary.
    When an alignment split size is set there is one entry per read split,
    each a deep copy of ``data`` carrying its own ``align_split``. When
    ``"output_cwl_keys"`` is present in ``data`` the entries are instead
    passed through ``cwlutils.samples_to_records``.

    Samples without input files, without an aligner, or with UMI type
    ``"dragen"`` are returned untouched as ``[[data]]``.
    """
    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"]) or
             objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"] or
                data["files"][0] is None or not aligner):
            return [[data]]
    # if this is a DRAGEN BAM, we need to do further alignments with this BAM,
    # so don't convert it
    if dd.get_umi_type(data) == "dragen":
        return [[data]]
    # Keep the original inputs before they are replaced by prepared files.
    data["files_orig"] = data["files"]
    data["files"] = prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Splits are computed from the first input file only.
        splits = _find_read_splits(
            data["files"][0],
            int(data["config"]["algorithm"]["align_split_size"]))
        for split in splits:
            # Each split gets an independent copy of the sample.
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
def umi_transform(data):
    """Put the barcode and UMI of each read into the read name.

    Up to four input fastq files are passed to ``umis fastqtransform``; the
    result is piped through ``seqtk seq -L 20`` and gzip into
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    The sample's UMI type is either a path to an existing transform file or
    a key of ``transforms``, whose entry is written to
    ``<work_dir>/umis/<umi_type>.json`` with ``write_transform_file``.

    Returns ``[[data]]`` with ``data["files"]`` replaced by a one-element
    list: the transformed fastq, or the original first fastq when it already
    looks transformed.
    """
    # Pad a copy to four entries so the caller's list is left untouched.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms[transform]
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)

    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # Reuse the output of a previous run.
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)

    # Skip the transformation if the first record already carries a UMI tag.
    with open_fastq(fq1) as in_handle:
        # The next() builtin works on both Python 2 and 3 iterators.
        read = next(in_handle)
        if "UMI_" in read:
            # Pass the input on unchanged: out_file failed the file_exists
            # check above, so pointing at it would hand a missing file on.
            data["files"] = [fq1]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def umi_consensus(data):
    """Turn UMI grouped reads into a consensus fastq pair for re-alignment.

    Chains fgbio GroupReadsByUmi, a consensus caller and
    FilterConsensusReads, then bamtofastq, writing ``<base>-cumi-1.fq.gz``,
    ``<base>-cumi-2.fq.gz`` and ``<base>.family_size_histogram.tsv`` next to
    the work BAM. Nothing is run when the first fastq is already up to date
    relative to the BAM.

    Returns a tuple ``(fastq1, fastq2, average_coverage)``.
    """
    in_bam = dd.get_work_bam(data)
    if dd.get_umi_type(data) == "dragen":
        method, tag = "adjacency", "RX"
    else:
        method, tag = _check_umi_type(in_bam)

    stem = utils.splitext_plus(in_bam)[0]
    fq1_final = f"{stem}-cumi-1.fq.gz"
    fq2_final = f"{stem}-cumi-2.fq.gz"
    hist_final = f"{stem}.family_size_histogram.tsv"
    mean_cov = coverage.get_average_coverage(
        "rawumi", dd.get_variant_regions(data), data)
    fgbio_exe = config_utils.get_program("fgbio", data["config"])
    bamtofastq_exe = config_utils.get_program("bamtofastq", data["config"])

    if utils.file_uptodate(fq1_final, in_bam):
        return fq1_final, fq2_final, mean_cov

    with file_transaction(data, fq1_final, fq2_final, hist_final) as tx_files:
        tx_fq1, tx_fq2, tx_hist = tx_files
        jvm = _get_fgbio_jvm_opts(data, os.path.dirname(tx_fq1), 2)
        estimates = _estimate_fgbio_defaults(mean_cov)
        grouping, consensus, filtering = _get_fgbio_options(
            data, estimates, method)
        if method == "paired":
            caller = "CallDuplexConsensusReads"
        else:
            caller = "CallMolecularConsensusReads"
        scratch = "%s-bamtofastq-tmp" % utils.splitext_plus(fq1_final)[0]
        reference = dd.get_ref_file(data)
        template = (
            "unset JAVA_HOME && "
            "{fgbio} {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
            "-i {align_bam} -f {tx_fhist_out} | "
            "{fgbio} {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
            "-i /dev/stdin -o /dev/stdout | "
            "{fgbio} {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
            "-i /dev/stdin -o /dev/stdout | "
            "{bamtofastq} collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
        command = template.format(
            fgbio=fgbio_exe,
            jvm_opts=jvm,
            # Avoid compression read/write bottlenecks between the piped steps.
            io_opts="--async-io=true --compression=0",
            group_opts=grouping,
            umi_tag=tag,
            umi_method=method,
            align_bam=in_bam,
            tx_fhist_out=tx_hist,
            cons_method=caller,
            cons_opts=consensus,
            filter_opts=filtering,
            ref_file=reference,
            bamtofastq=bamtofastq_exe,
            tempfile=scratch,
            tx_f1_out=tx_fq1,
            tx_f2_out=tx_fq2)
        do.run(command, "UMI consensus fastq generation")
    return fq1_final, fq2_final, mean_cov
def is_supported_transform(data):
    """Tell whether the sample's UMI type is one of ``SUPPORTED_TRANSFORMS``."""
    umi_type = dd.get_umi_type(data)
    return umi_type in SUPPORTED_TRANSFORMS
def is_precollapsed_bam(data):
    """Tell whether the sample has UMI type ``"fastq_name"`` but no UMI.

    ``has_umi`` is only consulted when the UMI type is ``"fastq_name"``.
    """
    if dd.get_umi_type(data) != "fastq_name":
        return False
    return not has_umi(data)
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.

    Depending on the first input file and the configuration, one of these
    paths is taken:

    - input present and an aligner configured: align (or, for UMI type
      ``"dragen"``, fix up the input BAM), then optionally correct UMIs and
      build UMI consensus reads that are aligned again;
    - ``.bam`` input without an aligner: clean, sort or link the BAM
      according to ``bam_clean`` / ``bam_sort``, then index, check the header
      and deduplicate;
    - ``.cram`` input without an aligner: used directly as ``work_bam``;
    - no input and no aligner: variant calling is switched off.

    Args:
        data: sample to process; passed through ``utils.to_single_data``.
        alt_input: optional ``(fastq1, fastq2)`` pair used in place of the
            sample's own input sequence files.

    Returns:
        ``[[data]]`` with ``work_bam`` set where applicable and, when a work
        BAM exists, ``align_bam`` pointing at it.

    Raises:
        ValueError: for single-end input with UMI consensus requested, for
            ``bam_clean: picard`` combined with a non-coordinate ``bam_sort``,
            when no input files are given, or when the input cannot be
            processed with the given configuration.
        AssertionError: when UMI type is ``"dragen"`` and the input is not a
            BAM file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        if dd.get_umi_type(data) == "dragen":
            # Here the "fastq1" input is expected to be a BAM file.
            assert bam.is_bam(
                fastq1), f"umi_type: dragen needs a BAM file as input."
            data = dragen.fix_umi_dragen_bam(data, bam=fastq1)
            # fastq1 = bam.sort(fastq1, dd.get_config(data))
            # bam.index(fastq1, dd.get_config(data))
            # data["work_bam"] = fastq1
        else:
            logger.info("Aligning lane %s with %s aligner" %
                        (data["rgnames"]["lane"], aligner))
            data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            # Remember the pre-consensus BAM before work_bam is replaced.
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2 or dd.get_umi_type(data) == "dragen":
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                # The consensus reads are aligned again below with the UMI
                # type removed and duplicate marking switched off.
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        # Pre-aligned BAM input without an aligner.
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            # First candidate: a sorted BAM named after the input file,
            # directly in the work directory.
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                # Otherwise use a per-sample path under bamclean/.
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            # No cleaning or sorting requested: link the BAM under prealign/.
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        # CRAM input is stored as the work "BAM" without conversion.
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        # No input and no aligner: disable variant calling for this sample.
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:
        # kraken doesn't need a BAM
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct or is empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
def umi_transform(data):
    """Put the barcode and UMI of each read into the read name.

    Up to four input fastq files are passed to ``umis fastqtransform``; the
    result is piped through ``seqtk seq -L 20`` and gzip into
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    Without a configured UMI type the first fastq must already look
    transformed (``is_transformed``); otherwise an error is logged and the
    process exits with status 1. A configured UMI type is either a path to a
    transform file or a name resolved through ``get_transform_file``; if
    neither yields an existing file the process also exits with status 1.

    For demultiplexed samples ``--demuxed_cb <sample name>`` is passed and
    barcode splitting is disabled.

    Returns ``[[data]]`` with ``data["files"]`` replaced by a one-element
    list: the transformed fastq, or the original first fastq when it already
    looks transformed.
    """
    # Pad a copy to four entries so the caller's list is left untouched.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged."
                % fq1)
            data["files"] = [fq1]
            return [[data]]
        logger.error(
            "No UMI transform was specified, but %s does not look "
            "pre-transformed." % fq1)
        sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (transform, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)

    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # Reuse the output of a previous run.
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    # Materialize the barcodes so len() also works on a lazy iterable.
    cellular_barcodes = list(get_cellular_barcodes(data))
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    cores = dd.get_num_cores(data)

    # Skip the transformation if the first record already carries a UMI tag.
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            # Pass the input on unchanged, like the pre-transformed branch
            # above: out_file failed the file_exists check, so pointing at it
            # would hand a missing file on.
            data["files"] = [fq1]
            return [[data]]

    # NOTE(review): locale_export is not referenced by the command template;
    # presumably _umis_cmd already applies it -- confirm before removing.
    locale_export = utils.locale_export()
    umis = _umis_cmd(data)
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} {demuxed_option} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s"
        % fq1)
    with file_transaction(out_file) as tx_out_file:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]