def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with hisat2 and register the output BAM as the work BAM.

    The BAM is written to ``<align_dir>/<lane>.bam``. If that file already
    exists the alignment is skipped and only the work BAM is set.

    fastq_file -- first (or only) FASTQ input
    pair_file -- second FASTQ for paired-end input, falsy for single-end
    ref_file -- value passed to hisat2 ``-x``
    names -- read group information handed to ``_get_rg_flags``
    align_dir -- directory receiving the BAM
    data -- sample dictionary; returned after ``dd.set_work_bam``
    """
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    # The {placeholders} below are filled from locals() at run time, so the
    # local variable names in this function are part of the command template.
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    # compare analysis names case-insensitively in every branch
    analysis = dd.get_analysis(data).lower()
    if analysis == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if analysis == "rna-seq":
        # the splice site file is looked up next to the configured GTF file;
        # gtf_file was previously referenced here without ever being defined
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        # pipe hisat2 output into the command built by postalign
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run hisat2 for one lane and register the resulting BAM as the work BAM.

    Output goes to ``<align_dir>/<lane>.bam``; when that file is already
    present no alignment is run. Returns ``data`` as updated by
    ``dd.set_work_bam``.
    """
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if not file_exists(out_file):
        # Placeholders are substituted from locals() below, so the local
        # names in this function must match them exactly.
        cmd = (
            "{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
            "{rg_flags} ")
        cmd += "-1 {fastq_file} -2 {pair_file} " if paired else "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # a configured transcript assembler (cufflinks/stringtie) gets extra flags
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            gtf_file = dd.get_gtf_file(data)
            splicesites = os.path.join(os.path.dirname(gtf_file),
                                       "ref-transcripts-splicesites.txt")
            cmd += "--known-splicesite-infile {splicesites} "
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with file_transaction(out_file) as tx_out_file:
            cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
            do.run(cmd.format(**locals()), message)
    return dd.set_work_bam(data, out_file)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align with hisat2, then record the work BAM and junction BED on ``data``.

    The target BAM is ``<align_dir>/<sample name>-sort.bam``. When ``data``
    has ``align_split`` set, the output file and the FASTQ inputs are replaced
    by what ``alignprep.setup_combine`` and ``alignprep.split_namedpipe_cls``
    return. Alignment is skipped when the output (or the final combined file)
    already exists.
    """
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    already_aligned = file_exists(out_file) or (
        final_file is not None and file_exists(final_file))
    if not already_aligned:
        # Placeholders are resolved from locals() when the command is run,
        # so local names here must match them.
        cmd = (
            "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
            "{rg_flags} ")
        cmd += "-1 {fastq_file} -2 {pair_file} " if paired else "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # a configured transcript assembler (cufflinks/stringtie) gets extra flags
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            # only pass the known splice sites when the file is actually there
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(
            align_dir,
            "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # user-supplied hisat2 options from the configuration go last
        cmd += " ".join(_get_options_from_config(data))
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return dd.set_junction_bed(data, get_splicejunction_file(align_dir, data))
def _maybe_add_sailfish_files(algorithm, sample, out):
    """Append the sample's sailfish directory to ``out`` when one is set.

    Nothing is added for the ``fastrna-seq`` analysis. The analysis name is
    compared case-insensitively, matching how analysis names are checked in
    the rest of this file.

    algorithm -- unused; kept so the signature matches existing callers
    sample -- sample dictionary read through ``dd`` accessors
    out -- list of file descriptions; mutated in place and returned
    """
    analysis = (dd.get_analysis(sample) or "").lower()
    sailfish_dir = dd.get_sailfish_dir(sample)
    if sailfish_dir and analysis != "fastrna-seq":
        out.append({"path": sailfish_dir,
                    "type": "directory",
                    "ext": "sailfish"})
    return out
def counts_spikein(data):
    """Quantify spike-in sequences with salmon and store the result on ``data``.

    Returns ``data`` unchanged when no spike-in FASTA is configured. Otherwise
    an index is built under ``<work_dir>/spikein/<sample name>`` and the
    quantification output is recorded with ``dd.set_spikein_counts``.

    Raises AssertionError when the configured FASTA file does not exist.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    # NOTE: asserts are skipped when Python runs with -O
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        # small RNA always uses a fixed k-mer size; no read length needed
        kmersize = 15
    else:
        readlength = fastq.estimate_read_length(fq1)
        # use an odd k-mer size no larger than the read length, capped at 31
        if readlength % 2 == 0:
            readlength -= 1
        kmersize = min(readlength, 31)
    # log after the final value is known so the message matches what is used
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def _use_spark(num_cores, gatk_type, items, opts):
    """Decide whether Spark should be used.

    Always False when the first item's analysis is ``rna-seq``. Otherwise True
    when ``--spark-master`` is in ``opts``, or when there is exactly one item,
    more than one core and ``gatk_type`` is ``gatk4``.
    """
    if dd.get_analysis(items[0]).lower() == "rna-seq":
        return False
    single_sample_gatk4 = len(items) == 1 and num_cores > 1 and gatk_type == "gatk4"
    return single_sample_gatk4 or "--spark-master" in opts
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    Picks a representative sample via ``_get_batch_representative`` and, when
    it has a variant file, runs these steps in order on it: effects
    annotation, VCF finalization, RNA editing annotation (analyses whose name
    contains "rna-seq"), population annotation (CWL runs only), filtering,
    prioritization, germline extraction (only when prioritization produced a
    different file) and damage filtering (only when an aligned BAM is
    available).

    Returns the processed sample wrapped as ``[[data]]``.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        # inputs carrying "vrn_file_joint" are processed under that key instead
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # remember the incoming path so it can be restored at the end
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        # keep the current file when no annotated file was produced
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            # NOTE(review): input comes from dd.get_vrn_file(data), not
            # data[vrn_key] -- confirm this is intended when vrn_key is
            # "vrn_file_joint"
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            # NOTE(review): same dd.get_vrn_file(data) vs data[vrn_key]
            # question as above
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}),
            data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        # germline extraction only runs when prioritization returned a new file
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    # if the result is still the same file as the input, report the original path
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def _maybe_add_sailfish_files(algorithm, sample, out):
    """Append the sample's sailfish ``quant`` directory to ``out`` if it exists.

    The directory checked is
    ``<work_dir>/sailfish/<sample name>/quant``, and that same path is the one
    reported, so the entry always points at the directory that was verified.

    algorithm -- unused; kept so the signature matches existing callers
    sample -- sample dictionary read through ``dd`` accessors
    out -- list of file descriptions; mutated in place and returned
    """
    sailfish_dir = os.path.join(dd.get_work_dir(sample), "sailfish",
                                dd.get_sample_name(sample), "quant")
    if os.path.exists(sailfish_dir):
        out.append({"path": sailfish_dir,
                    "type": "directory",
                    "ext": "sailfish"})
    return out
def _maybe_add_sailfish_files(algorithm, sample, out):
    """Append the sample's sailfish ``quant`` directory to ``out`` if it exists.

    The directory is ``<work_dir>/sailfish/<sample name>/quant``.

    algorithm -- unused; kept so the signature matches existing callers
    sample -- sample dictionary read through ``dd`` accessors
    out -- list of file descriptions; mutated in place and returned
    """
    # the analysis type does not influence the result, so it is not looked up
    sailfish_dir = os.path.join(dd.get_work_dir(sample), "sailfish",
                                dd.get_sample_name(sample), "quant")
    if os.path.exists(sailfish_dir):
        out.append({"path": sailfish_dir,
                    "type": "directory",
                    "ext": "sailfish"})
    return out
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """hisat2 alignment step: produce the sample BAM and attach it to ``data``.

    Writes ``<align_dir>/<sample name>-sort.bam`` unless it, or the final
    combined file of a split alignment, already exists. Afterwards the work
    BAM and the splice junction BED are set on ``data``, which is returned.
    """
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        # split alignments write to an intermediate file and read via the
        # inputs returned by alignprep
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    needs_alignment = (not file_exists(out_file)
                       and (final_file is None or not file_exists(final_file)))
    if needs_alignment:
        # Template fragments; {placeholders} are filled from locals() below,
        # so the local names in this function must match them.
        pieces = ["{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} ",
                  "{rg_flags} "]
        if paired:
            pieces.append("-1 {fastq_file} -2 {pair_file} ")
        else:
            pieces.append("-U {fastq_file} ")
        if dd.get_analysis(data).lower() == "smallrna-seq":
            pieces.append("-k 1000 ")
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            pieces.append("--dta-cufflinks ")
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                pieces.append("--known-splicesite-infile {splicesites} ")
        novel_splicesite_file = os.path.join(
            align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        pieces.append("--novel-splicesite-outfile {novel_splicesite_file} ")
        # apply additional hisat2 options
        pieces.append(" ".join(_get_options_from_config(data)))
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd = "".join(pieces) + " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    return dd.set_junction_bed(data, junctionbed)
def _default_conf_files(data, retriever):
    """List the default annotation configuration names for a sample.

    Returns an empty list unless the sample has a variant caller or a variant
    file. Otherwise the list holds, in this order and each only when its
    condition is met: "gemini", "somatic" and, for analyses whose name
    contains "rna-seq", "rnaedit".
    """
    if not (dd.get_variantcaller(data) or dd.get_vrn_file(data)):
        return []
    selected = []
    if annotate_gemini(data, retriever):
        selected.append("gemini")
    if _annotate_somatic(data, retriever):
        selected.append("somatic")
    if "rna-seq" in dd.get_analysis(data).lower():
        selected.append("rnaedit")
    return selected
def _check_dedup(data):
    """Check configuration for de-duplication.

    Defaults to no de-duplication for RNA-seq and small RNA, the
    back compatible default. Allow overwriting with explicit
    `mark_duplicates: true` setting.

    Returns the configured ``mark_duplicates`` value, or the default when it
    is not set. A non-empty string value is replaced by True after logging a
    warning.
    """
    # `basestring` only exists on Python 2; referencing it directly raised
    # NameError on Python 3 for any truthy setting.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str
    is_rna = dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"]
    dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), not is_rna)
    if dup_param and isinstance(dup_param, string_types):
        logger.info("Warning: bcbio no longer support explicit setting of mark_duplicate algorithm. "
                    "Using best-practice choice based on input data.")
        dup_param = True
    return dup_param
def _check_dedup(data):
    """Resolve the ``mark_duplicates`` setting for a sample.

    When the setting is absent the default is False for RNA-seq and small RNA
    analyses and for samples without an aligner, and True otherwise. An
    explicit `mark_duplicates` value overrides the default; a non-empty string
    value is logged as unsupported and replaced by True.
    """
    skip_by_default = (dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"]
                       or not dd.get_aligner(data))
    setting = utils.get_in(data, ("config", "algorithm", "mark_duplicates"),
                           not skip_by_default)
    if not (setting and isinstance(setting, six.string_types)):
        return setting
    # naming a specific de-duplication algorithm is no longer honoured
    logger.info("Warning: bcbio no longer support explicit setting of mark_duplicate algorithm. "
                "Using best-practice choice based on input data.")
    return True
def counts_spikein(data):
    """Run salmon quantification against the configured spike-in FASTA.

    Returns ``data`` untouched when no spike-in FASTA is configured; otherwise
    returns it with the quantification output set via
    ``dd.set_spikein_counts``. The index k-mer size is 15 for the
    ``smallrna-seq`` analysis and 31 for everything else.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    # exactly two inputs are treated as a pair; otherwise only the first is used
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1 = files[0]
        fq2 = None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        kmer = 15
    else:
        kmer = 31
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    quant_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    return dd.set_spikein_counts(data, quant_file)
def variant_filtration(call_file, ref_file, vrn_files, data, items):
    """Route a call file to the filter matching the configured variant caller.

    Unless "gvcf" is among the sample's enabled tools, the calls first go
    through ``ploidy.filter_vcf_by_sex``. freebayes, platypus and samtools use
    their ``vfilter`` functions. The GATK callers use the RNA-seq specific
    filter when the analysis name contains "rna-seq" and ``gatkfilter.run``
    otherwise. For any other caller the file is returned without further
    filtering.
    """
    caller = data["config"]["algorithm"].get("variantcaller")
    if "gvcf" not in dd.get_tools_on(data):
        call_file = ploidy.filter_vcf_by_sex(call_file, items)
    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)
    if caller == "platypus":
        return vfilter.platypus(call_file, data)
    if caller == "samtools":
        return vfilter.samtools(call_file, data)
    if caller in ("gatk", "gatk-haplotype", "haplotyper"):
        if "rna-seq" in dd.get_analysis(data).lower():
            # imported here, only on the path that needs it, as in the original
            from bcbio.rnaseq import variation as rnaseq_variation
            return rnaseq_variation.gatk_filter_rnaseq(call_file, data)
        return gatkfilter.run(call_file, ref_file, vrn_files, data)
    # no additional filtration for callers that filter as part of call process
    return call_file
def counts_spikein(data):
    """Quantify spike-in sequences with salmon and store the result on ``data``.

    Returns ``data`` unchanged when no spike-in FASTA is configured. Otherwise
    an index is built under ``<work_dir>/spikein/<sample name>`` and the
    quantification output is recorded with ``dd.set_spikein_counts``.

    The index k-mer size is 15 for the ``smallrna-seq`` analysis. For all
    other analyses it is the estimated read length of the first FASTQ, made
    odd and capped at 31.

    Raises AssertionError when the configured FASTA file does not exist.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    # NOTE: asserts are skipped when Python runs with -O
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        # fixed size for small RNA, so the read length is not estimated
        kmersize = 15
    else:
        readlength = fastq.estimate_read_length(fq1)
        if readlength % 2 == 0:
            readlength -= 1
        kmersize = min(readlength, 31)
    # report the value that is really passed to the indexer; previously this
    # was logged before the small RNA override was applied
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data