def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)

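# Hedged sketch of the `resources` configuration consumed above. The YAML keys
# follow bcbio's `resources` conventions, but the specific option values are
# illustrative assumptions, not taken from this module:
#
#   resources:
#     qualimap:
#       memory: 4G
#       options: ["-outformat", "PDF"]
#
# With "PDF" present in the joined options string, the code above appends
# "-outfile qualimapReport.pdf" so qualimap writes a PDF report.
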
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = flowcell.parse_dirname(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {"method": "galaxy", "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name, "fc_date": fc_date}
            run_details.append(item)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = add_reference_resources(item)
        out.append(item)
    return out

def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out

def run_prepare(*data):
    """Run seqcluster prepare to merge all samples into one file.
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("You didn't specify any expression caller tool. "
                    "You can add one to the YAML file: "
                    "expression_caller: [trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle,
                                                    seq_handle, min_shared)
    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data

def run_memory_retry(cmd, descr, data=None, check=None, region=None):
    """Run command, retrying when detecting failures due to memory errors.

    This is useful for high throughput Java jobs which fail intermittently
    due to an inability to get system resources.
    """
    max_runs = 5
    num_runs = 0
    while 1:
        try:
            run(cmd, descr, data, check, region=region, log_error=False)
            break
        except subprocess.CalledProcessError as msg:
            if num_runs < max_runs and ("insufficient memory" in str(msg)
                                        or "did not provide enough memory" in str(msg)
                                        or "A fatal error has been detected" in str(msg)
                                        or "java.lang.OutOfMemoryError" in str(msg)
                                        or "Resource temporarily unavailable" in str(msg)):
                logger.info("Retrying job. Memory or resource issue with run: %s"
                            % _descr_str(descr, data, region))
                time.sleep(30)
                num_runs += 1
            else:
                logger.exception(_descr_str(descr, data, region))
                raise

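# Hedged usage sketch for run_memory_retry (not from the original module): the
# command string and description below are illustrative assumptions about how a
# caller would wrap a memory-hungry Java job so transient OOM failures are
# retried up to five times with a 30 second pause between attempts.
#
#   cmd = "picard MarkDuplicates I=sample.bam O=sample-dedup.bam M=dup_metrics.txt"
#   run_memory_retry(cmd, "Mark duplicates: sample1")
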
def genebody_coverage2(in_file, config, out_prefix=None):
    """Check the 5'/3' bias across transcripts.

    Takes a BAM file, converts it to bigwig and runs geneBody_coverage2.py
    on the result.
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)
    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file

def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """Remove reads from a pair of fastq files that are shorter than a minimum
    length. Drops both ends of a pair if either end falls below the threshold,
    while maintaining the order of the reads.
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]
    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)
    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                # write ends which individually pass to the singles files
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))
    return [fq1_out, fq2_out]

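# Hedged usage sketch (file names are illustrative; the quality_format is
# passed straight to Biopython's SeqIO, which accepts encodings such as
# "fastq-sanger"). The paired survivors land in the ".fixed" files, orphaned
# mates in the ".singles" files:
#
#   fixed_fq1, fixed_fq2 = filter_reads_by_length("s_1.fastq", "s_2.fastq",
#                                                 "fastq-sanger", min_length=30)
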
def detect_fastq_format(in_file, MAX_RECORDS=1000000):
    """Detect the format of a fastq file.

    Returns multiple formats if the file could be more than one.
    """
    logger.info("Detecting FASTQ format on %s." % (in_file))
    kept = list(_FASTQ_RANGES.keys())
    with open(in_file) as in_handle:
        records_read = 0
        for i, line in enumerate(in_handle):
            # get the quality line
            if records_read >= MAX_RECORDS:
                break
            if i % 4 == 3:
                records_read += 1
                for c in line:
                    # work on a copy so we can safely remove from kept
                    formats = list(kept)
                    if len(formats) == 1:
                        return formats
                    for form in formats:
                        if (_FASTQ_RANGES[form][0] > ord(c) or
                                _FASTQ_RANGES[form][1] < ord(c)):
                            kept.remove(form)
    return kept

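# For context, _FASTQ_RANGES maps format names to inclusive ASCII bounds for
# quality characters. The definition is not shown in this extract; a plausible
# shape, using the commonly published per-encoding offsets, would be:
#
#   _FASTQ_RANGES = {"sanger": (33, 73),
#                    "solexa": (59, 104),
#                    "illumina_1.3+": (64, 104),
#                    "illumina_1.5+": (66, 104),
#                    "illumina_1.8+": (33, 74)}
#
# A quality character outside a format's range eliminates that format as a
# candidate; detection stops early once only one candidate remains.
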
def _merge_metrics(samples, out_dir):
    """Merge metrics from multiple QC steps.
    """
    logger.info("summarize metrics")
    out_dir = utils.safe_makedir(os.path.join(out_dir, "report", "metrics"))
    sample_metrics = collections.defaultdict(dict)
    for s in samples:
        s = _add_disambiguate(s)
        m = tz.get_in(['summary', 'metrics'], s)
        if isinstance(m, six.string_types):
            m = json.loads(m)
        if m:
            # drop nested values which do not fit into a flat metrics table;
            # iterate over a copy of the keys so we can pop safely
            for me in list(m.keys()):
                if isinstance(m[me], (list, dict, tuple)):
                    m.pop(me, None)
            sample_metrics[dd.get_sample_name(s)].update(m)
    out = []
    for sample_name, m in sample_metrics.items():
        sample_file = os.path.join(out_dir, "%s_bcbio.txt" % sample_name)
        with file_transaction(samples[0], sample_file) as tx_out_file:
            dt = pd.DataFrame(m, index=['1'])
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "")
                          for k in dt.columns]
            dt['sample'] = sample_name
            dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
            dt['RiP_pct'] = "%.3f" % (int(m.get("RiP", 0)) /
                                      float(m.get("Total_reads", 1)) * 100)
            dt = _fix_duplicated_rate(dt)
            dt.transpose().to_csv(tx_out_file, sep="\t", header=False)
        out.append(sample_file)
    return out

def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    logger.info("Demultiplexing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)
    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            for fastq1, fastq2, lane_ext in _prep_fastq_files(item, bc_files, dirs, config):
                cur_lane_name = lane_name
                cur_lane_desc = item["description"]
                if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                    cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
                if item["barcode_id"] is not None:
                    cur_lane_name += "_%s" % (item["barcode_id"])
                if lane_ext is not None:
                    cur_lane_name += "_s{0}".format(lane_ext)
                if config["algorithm"].get("trim_reads", False):
                    trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                                dirs, config)
                    fastq1 = trim_info[0]
                    if fastq2 is not None:
                        fastq2 = trim_info[1]
                out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                            dirs, config))
    return out

def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file,
                                     base_dir, edata["sam_ref"], edata)
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": out_vcf_file,
                    "bed_file": None}
    return [[batch_id, callinfo]]

def run(self, config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner"]),
                    samples, config, dirs, "multicore") as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
    ## Quality control
    with prun.start(_wres(parallel, ["fastqc", "bamtools", "samtools", "qsignature", "kraken"]),
                    samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            for sample in samples:
                run_parallel("upload_samples", [sample])
    logger.info("Timing: finished")
    return samples

def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0],
                                                      ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)

def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | samtools view -bhS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment."""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = None, None
        if dd.get_disambiguate(data):
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"],
                                                    is_retry=False,
                                                    output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        ref_file = dd.get_ref_file(data)
        logger.info("Transcriptome alignment was flagged to run, but the "
                    "transcriptome BAM file was not found. Aligning to the "
                    "transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]

def _mint_trna_annotation(data):
    """Use MINTmap to quantify tRNAs.
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir

def copy_flowcell(dname, fastq_dir, sample_cfile, config):
    """Copy required files for processing using rsync, potentially to a remote server.
    """
    with utils.chdir(dname):
        reports = reduce(operator.add,
                         [glob.glob("*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xsl"),
                          glob.glob("Data/Intensities/BaseCalls/*.htm"),
                          ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                           "Data/Status.htm", "Data/Status_Files", "InterOp"]])
        run_info = reduce(operator.add,
                          [glob.glob("run_info.yaml"),
                           glob.glob("*.csv")])
        fastq = glob.glob(os.path.join(fastq_dir.replace(dname + "/", "", 1), "*.gz"))
        configs = [sample_cfile.replace(dname + "/", "", 1)]
    include_file = os.path.join(dname, "transfer_files.txt")
    with open(include_file, "w") as out_handle:
        out_handle.write("+ */\n")
        for fname in configs + fastq + run_info + reports:
            out_handle.write("+ %s\n" % fname)
        out_handle.write("- *\n")
    # remote transfer
    if utils.get_in(config, ("process", "host")):
        dest = "%s@%s:%s" % (utils.get_in(config, ("process", "username")),
                             utils.get_in(config, ("process", "host")),
                             utils.get_in(config, ("process", "dir")))
    # local transfer
    else:
        dest = utils.get_in(config, ("process", "dir"))
    cmd = ["rsync", "-akmrtv", "--include-from=%s" % include_file, dname, dest]
    logger.info("Copying files to analysis machine")
    logger.info(" ".join(cmd))
    subprocess.check_call(cmd)

def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam,
                                                     ref_file, platform,
                                                     dbsnp_file, intervals, data)
    return [[data]]

def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing
    extra Version information from VCF header lines.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # use a distinct name so the vcfutils.pl program path does not shadow
    # the vcfutils module used below
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_pl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/' "
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})

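# Hedged illustration of the shell pipeline this renders on the newer
# bcftools path (the mpileup portion comes from prep_mpileup and is elided;
# file names are invented):
#
#   <samtools mpileup command> \
#     | bcftools call -v -c - \
#     | vcfutils.pl varFilter -D 1000 \
#     | sed 's/,Version=3>/>/' > region-variants.vcf
#
# The sed step strips the ",Version=3" suffix from VCF header lines so GATK
# tools downstream accept the file.
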
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None,
                           region=None, out_file=None, deep_coverage=False,
                           variant_regions=None):
    """Generate a list of interval regions for realignment around indels.
    """
    if out_file:
        out_file = "%s.intervals" % os.path.splitext(out_file)[0]
    else:
        out_file = "%s-realign.intervals" % os.path.splitext(align_bam)[0]
    # check only for file existence; interval files can be empty after running
    # on small chromosomes, so don't rerun in those cases
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            logger.info("GATK RealignerTargetCreator: %s %s" %
                        (os.path.basename(align_bam), region))
            params = ["-T", "RealignerTargetCreator",
                      "-I", align_bam,
                      "-R", ref_file,
                      "-o", tx_out_file,
                      "-l", "INFO",
                      ]
            region = subset_variant_regions(variant_regions, region, tx_out_file)
            if region:
                params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
            if dbsnp:
                params += ["--known", dbsnp]
            if deep_coverage:
                params += ["--mismatchFraction", "0.30",
                           "--maxIntervalSize", "650"]
            runner.run_gatk(params)
    return out_file

def gff3_to_gtf(gff3_file):
    dialect = {'field separator': '; ',
               'fmt': 'gtf',
               'keyval separator': ' ',
               'leading semicolon': False,
               'multival separator': ',',
               'quoted GFF2 values': True,
               'order': ['gene_id', 'transcript_id'],
               'repeated keys': False,
               'trailing semicolon': True}
    out_file = os.path.splitext(gff3_file)[0] + ".gtf"
    if file_exists(out_file):
        return out_file
    logger.info("Converting %s to %s." % (gff3_file, out_file))
    if _is_from_ncbi(gff3_file):
        logger.info("NCBI format detected by the presence of the %s key."
                    % _is_from_ncbi(gff3_file))
        _output_ncbi_gff3(gff3_file, out_file, dialect)
    else:
        _output_gff3(gff3_file, out_file, dialect)
    return out_file

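# For reference, a dialect with these settings renders GTF attribute columns
# like the line below (a hypothetical record, shown only to illustrate the
# separators, ordering and quoting the dialect selects):
#
#   chr1  source  exon  11869  12227  .  +  .  gene_id "GENE1"; transcript_id "TX1";
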
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(sam_file):
        if not file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            with open(tx_sam_file, "w") as out_handle:
                logger.info(" ".join(sam_cl))
                subprocess.check_call(sam_cl, stdout=out_handle)
    return sam_file

def _analysis_block_stats(regions):
    """Provide statistics on sizes and number of analysis blocks.
    """
    prev = None
    between_sizes = []
    region_sizes = []
    for region in regions:
        if prev and prev.chrom == region.chrom:
            between_sizes.append(region.start - prev.end)
        region_sizes.append(region.end - region.start)
        prev = region
    def descriptive_stats(xs):
        if len(xs) < 2:
            return xs
        parts = ["min: %s" % min(xs),
                 "5%%: %s" % numpy.percentile(xs, 5),
                 "25%%: %s" % numpy.percentile(xs, 25),
                 "median: %s" % numpy.percentile(xs, 50),
                 "75%%: %s" % numpy.percentile(xs, 75),
                 "95%%: %s" % numpy.percentile(xs, 95),
                 "99%%: %s" % numpy.percentile(xs, 99),
                 "max: %s" % max(xs)]
        return "\n".join(["  " + x for x in parts])
    logger.info("Identified %s parallel analysis blocks\n" % len(region_sizes) +
                "Block sizes:\n%s\n" % descriptive_stats(region_sizes) +
                "Between block sizes:\n%s\n" % descriptive_stats(between_sizes))
    if len(region_sizes) == 0:
        raise ValueError("No callable analysis regions found in all samples")

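# The log message above renders roughly like this (all numbers invented,
# purely to show the layout):
#
#   Identified 250 parallel analysis blocks
#   Block sizes:
#     min: 1204
#     5%: 9580.0
#     median: 125033.0
#     max: 3150200
#   Between block sizes:
#     ...
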
def illumina_qual_bin(in_file, ref_file, out_dir, config):
    """Uses CRAM to perform Illumina 8-bin approaches to existing BAM files.

    Bins quality scores according to Illumina scheme:
    http://www.illumina.com/Documents/products/whitepapers/whitepaper_datacompression.pdf

    Also fixes output header to remove extra run groups added by CRAM during conversion.
    """
    index_file = ref_file + ".fai"
    assert os.path.exists(index_file), "Could not find FASTA reference index: %s" % index_file
    out_file = os.path.join(out_dir, "%s-qualbin%s" % os.path.splitext(os.path.basename(in_file)))
    cram_jar = config_utils.get_jar("cramtools",
                                    config_utils.get_program("cram", config, "dir"))
    samtools = config_utils.get_program("samtools", config)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            orig_header = "%s-header.sam" % os.path.splitext(out_file)[0]
            header_cmd = "{samtools} view -H -o {orig_header} {in_file}"
            cmd = ("java -jar {cram_jar} cram --input-bam-file {in_file} "
                   " --reference-fasta-file {ref_file} --preserve-read-names "
                   " --capture-all-tags --lossy-quality-score-spec '*8' "
                   "| java -jar {cram_jar} bam --output-bam-format "
                   " --reference-fasta-file {ref_file} "
                   "| {samtools} reheader {orig_header} - "
                   "> {tx_out_file}")
            logger.info("Quality binning with CRAM")
            subprocess.check_call(header_cmd.format(**locals()), shell=True)
            subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file

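# For context: Illumina's 8-level binning collapses the full quality range
# into representative values. The commonly published mapping (stated here as
# background from the white paper linked above, not read from this code) is:
#
#   2-9 -> 6,  10-19 -> 15,  20-24 -> 22,  25-29 -> 27,
#   30-34 -> 33,  35-39 -> 37,  >=40 -> 40
#
# The '*8' lossy-quality-score-spec asks cramtools to apply this style of
# 8-bin quantization during the BAM -> CRAM -> BAM round trip.
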
def run_freebayes(align_bam, ref_file, config, dbsnp=None, region=None,
                  out_file=None):
    """Detect small polymorphisms with FreeBayes.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        logger.info("Genotyping with FreeBayes: {region} {fname}".format(
            region=region, fname=os.path.basename(align_bam)))
        with file_transaction(out_file) as tx_out_file:
            cl = [config["program"].get("freebayes", "freebayes"),
                  "-b", align_bam, "-v", tx_out_file, "-f", ref_file]
            if region:
                cl.extend(["-r", region])
            try:
                subprocess.check_call(cl)
            # XXX Temporary, work around freebayes issue; need to recall these regions
            # later so this is an ugly silent fix. Will need to grep for 'freebayes failed'
            # https://github.com/ekg/freebayes/issues/22
            except subprocess.CalledProcessError:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("##fileformat=VCFv4.1\n"
                                     "## No variants; freebayes failed\n"
                                     "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    return out_file

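# Example of the command line this builds (paths and region are illustrative):
#
#   freebayes -b sample.bam -v sample-variants.vcf -f genome.fa -r chr1:0-5000000
#
# On a non-zero exit the function writes a minimal header-only VCF instead, so
# downstream steps see a valid (empty) callset rather than a missing file.
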
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    # XXX Currently finding edge case failures with Dx calling, needs additional testing
    # purecn_out = _run_purecn_dx(purecn_out, paired)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "loh" in purecn_out:
            from bcbio.structural import titancna
            purecn_out["vrn_file"] = titancna.to_vcf(purecn_out["loh"], "PureCN",
                                                     _get_header, _loh_to_vcf,
                                                     paired.tumor_data, sep=",")
            purecn_out["lohsummary"] = loh.summary_status(purecn_out, paired.tumor_data)
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out

def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        jobr = ipython.find_job_resources([fn], parallel, items, sysinfo, config)
        items = [ipython.add_cores_to_config(x, jobr.cores_per_job) for x in items]
        if joblib is None:
            raise ImportError("Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x) for x in items):
            if data:
                out.extend(data)
        return out

def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, aligner, data)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"],
                                           data["dirs"], config)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"],
                                                         "prealign",
                                                         data["rgnames"]["sample"]))
        _check_prealigned_bam(fastq1, data["sam_ref"], config)
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    data["work_bam"] = out_bam
    return [[data]]

def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Perform realignment of BAM file in specified regions.
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                params = ["-T", "IndelRealigner",
                          "-I", align_bam,
                          "-R", ref_file,
                          "-targetIntervals", intervals,
                          "-o", tx_out_file,
                          "-l", "INFO",
                          ]
                if region:
                    params += ["-L", region]
                if deep_coverage:
                    params += ["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"]
                try:
                    runner.run_gatk(params, tmp_dir)
                except Exception:
                    logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                        os.path.basename(align_bam), region))
                    raise
    return out_file

def run(self, config, config_file, parallel, dirs, samples):
    with prun.start(_wres(parallel, ["picard", "AlienTrimmer"]),
                    samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)
    with prun.start(_wres(parallel, ["aligner", "picard"],
                          ensure_mem={"tophat": 8, "tophat2": 8, "star": 40}),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)
    with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"]),
                    samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples

def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2 = postalign.umi_consensus(data)
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` "
                                 "other than coordinate: %s" % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data),
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"],
                                                         "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct, and is the file non-empty?\n" +
                         "If it is a fastq file (not a pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
    data = _add_hla_files(data)
    return [[data]]

def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                          (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = run_parallel("calculate_sv_bins", [samples])
            samples = run_parallel("calculate_sv_coverage", samples)
            samples = region.clean_sample_data(samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                                     "gemini", "samtools", "fastqc", "sambamba",
                                     "bcbio-variation-recall", "qsignature",
                                     "svcaller", "kraken", "preseq"]),
                    samples, config, dirs, "multicore2",
                    multiplier=structural.parallel_multiplier(samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_parallel, "precall")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples

def _report_summary(samples, out_dir):
    """Run coverage report with the bcbiocov package.
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")
        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)
        logger.info("summarize metrics")
        samples = _merge_metrics(samples)
        logger.info("summarize target information")
        if samples[0].get("analysis", "").lower() in ["variant", "variant2"]:
            samples = _merge_target_information(samples)
        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            # cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples

def consensus(peakfiles, consensusfile, data, pad=250):
    """Call consensus peaks from a set of narrow/broad peakfiles.

    We use this method:
    https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html
    """
    if utils.file_exists(consensusfile):
        return consensusfile
    try:
        bedops = config_utils.get_program("bedops", data)
    except config_utils.CmdNotFound:
        logger.info("bedops not found, skipping consensus peak calling. Do a "
                    "--tools update to install bedops.")
        return None
    try:
        sortbed = config_utils.get_program("sort-bed", data)
    except config_utils.CmdNotFound:
        logger.info("sort-bed not found, skipping consensus peak calling. Do a "
                    "--tools update to install sort-bed.")
        return None
    try:
        bedmap = config_utils.get_program("bedmap", data)
    except config_utils.CmdNotFound:
        logger.info("bedmap not found, skipping consensus peak calling. Do a "
                    "--tools update to install bedmap.")
        return None
    logger.info(f"Calling consensus peaks on {','.join(peakfiles)}")
    logger.info(f"Removing low quality peaks from {','.join(peakfiles)}")
    filteredsummits = []
    for fn in peakfiles:
        filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name
        df = remove_low_quality_peaks(fn, qval=0.05)
        df.to_csv(filteredpeak, index=False, header=False, sep="\t")
        filteredsummit = peakfile_to_summitfile(filteredpeak)
        filteredsummits.append(filteredsummit)
    peakfiles = filteredsummits
    with file_transaction(consensusfile) as tx_consensus_file:
        message = (f"Combining summits of {' '.join(peakfiles)} and "
                   f"expanding {pad} bases.")
        with utils.tmpfile(suffix=".bed") as tmpbed:
            slopcommand = f"{bedops} --range {pad} -u {' '.join(peakfiles)} > {tmpbed}"
            do.run(slopcommand, message)
            iteration = 0
            solutions = []
            while os.path.getsize(tmpbed):
                iteration = iteration + 1
                iterationbed = NamedTemporaryFile(suffix=".bed", delete=False).name
                with utils.tmpfile(suffix="bed") as mergedbed, \
                     utils.tmpfile(suffix="bed") as intermediatebed, \
                     utils.tmpfile(suffix="bed") as leftoverbed:
                    mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | "
                                f"{bedops} -u --range 0:1 - > "
                                f"{mergedbed}")
                    message = f"Merging non-overlapping peaks, iteration {iteration}."
                    do.run(mergecmd, message)
                    nitems = len(open(mergedbed).readlines())
                    message = (f"Considering {nitems} peaks, choosing the highest "
                               f"score for overlapping peaks.")
                    highscorecmd = (f"{bedmap} --max-element {mergedbed} {tmpbed} | "
                                    f"{sortbed} - > "
                                    f"{iterationbed}")
                    do.run(highscorecmd, message)
                    message = "Checking if there are peaks left to merge."
                    anyleftcmd = f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}"
                    do.run(anyleftcmd, message)
                    shutil.move(intermediatebed, tmpbed)
                    solutions.append(iterationbed)
        message = f"Creating final consensus peak file: {consensusfile}."
        consensuscmd = f"{bedops} -u {' '.join(solutions)} > {tx_consensus_file}"
        do.run(consensuscmd, message)
    return consensusfile

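# Sketch of one pass of the master-list iteration above on a toy example (all
# coordinates and scores invented). Suppose two padded summits overlap:
#
#   A: chr1 100-601 score 5      B: chr1 350-851 score 9
#
# bedops -m merges them into one block; bedmap --max-element picks B (the
# highest-scoring element overlapping that block) as this iteration's
# representative; bedops -n 1 then removes everything overlapping B from the
# working set. A is gone with no peaks left, tmpbed becomes empty, and the
# loop ends with B as the sole consensus peak for that locus.
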
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data),
                                                orig_items)
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"),
                                                     data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]

def htseq_count(data):
    """Adapted from Simon Anders' htseq-count.py script:
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
    """
    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s."
                % (os.path.basename(sam_filename),
                   os.path.basename(gff_filename),
                   _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute"
                             % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'."
                             % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occurred in %s.\n"
                         % gff.get_line_number_string())
        raise
    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n"
                         % feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        first_read = iter(align_reader).next()
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occurred when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # single-end reads iterate directly over the reader
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv) for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1
            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n"
                                 % (i, "lines " if not pe_mode else "line pairs"))
    except:
        if not pe_mode:
            sys.stderr.write("Error occurred in %s.\n"
                             % read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occurred in %s.\n"
                             % read_seq_pe_file.get_line_number_string())
        raise
    sys.stderr.write("%d sam %s processed.\n"
                     % (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file

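# Toy illustration of the overlap modes used above (intervals invented):
#
#   - a read whose aligned bases all fall inside geneA -> union gives
#     {geneA}: counted once for geneA.
#   - a read straddling geneA and geneB -> union gives {geneA, geneB}:
#     tallied as ambiguous.
#   - under intersection-strict every aligned base must share the same
#     feature set, so a read half inside geneA and half intergenic
#     intersects with an empty set and is tallied as empty ("no_feature").
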
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation.

    paired is one T/N pair or only a tumor sample.
    """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd()
    purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                          "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference",
                                 "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mapping-bias-file", mappingbiasfile,
           "--intervals", intervals,
           "--snp-blacklist", simple_repeat_bed,
           "--genome", genome,
           "--force", "--post-optimize", "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use a matched normal sample in PureCN analysis,
    # because then it skips the PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="base"), utils.get_R_exports(env="base"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed")
    out_base, out, all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out

def _run_purecn(paired, work_dir): """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs. """ segfns = { "cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk } out_base, out, all_files = _get_purecn_files(paired, work_dir) failed_file = out_base + "-failed.log" cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data) if not utils.file_uptodate( out["rds"], cnr_file) and not utils.file_exists(failed_file): cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)]( cnr_file, work_dir, paired) from bcbio import heterogeneity vcf_file = heterogeneity.get_variants( paired.tumor_data, include_germline=False)[0]["vrn_file"] vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir) with file_transaction(paired.tumor_data, out_base) as tx_out_base: # Use UCSC style naming for human builds to support BSgenome genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in [ "GRCh37", "hg19" ] else dd.get_genome_build(paired.tumor_data)) rscript = utils.Rscript_cmd() purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base") cmd = [ rscript, purecn_r, "--seed", "42", "--out", tx_out_base, "--rds", "%s.rds" % tx_out_base, "--sampleid", dd.get_sample_name(paired.tumor_data), "--genome", genome, "--vcf", vcf_file, "--tumor", cnr_file, "--segfile", seg_file, "--funsegmentation", "Hclust", "--maxnonclonal", "0.3" ] if dd.get_num_cores(paired.tumor_data) > 1: cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))] try: cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib( env="base"), utils.get_R_exports(env="base"), " ".join( [str(x) for x in cmd])) do.run(cmd, "PureCN copy number calling") except subprocess.CalledProcessError as msg: if _allowed_errors(str(msg)): logger.info( "PureCN failed to find solution for %s: skipping" % dd.get_sample_name(paired.tumor_data)) with open(failed_file, "w") as out_handle: out_handle.write(str(msg)) else: logger.exception() raise for f in all_files: if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)): shutil.move(os.path.join(os.path.dirname(tx_out_base), f), os.path.join(os.path.dirname(out_base), f)) out = _get_purecn_files(paired, work_dir, require_exist=True)[1] return out if (out.get("rds") and os.path.exists(out["rds"])) else None
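# A hedged sketch of the `_allowed_errors` pattern used above: PureCN exits
# non-zero both for real failures and for "no solution" cases the pipeline
# wants to treat as skippable, so the captured CalledProcessError text is
# matched against known substrings. The message list here is illustrative,
# not the actual list bcbio ships.
ALLOWED_ERROR_SNIPPETS = [
    "Could not find valid purity and ploidy solution.",
    "None of the variants in provided VCF passed filtering.",
]

def _allowed_errors_sketch(msg):
    """Return True when the failure text matches a known skippable error."""
    return any(snippet in msg for snippet in ALLOWED_ERROR_SNIPPETS)

assert _allowed_errors_sketch("Error: Could not find valid purity and ploidy solution.")
assert not _allowed_errors_sketch("Error: segmentation fault")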
def _cut_file(self, in_file):
    """run cutadapt on a single file"""
    adapters = self._get_adapters(self.chemistry)
    out_file = self.in2trimmed(in_file)
    if file_exists(out_file):
        return out_file
    cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))
    quality_format = self.quality_format
    if not quality_format:
        quality_format = self._detect_fastq_format(in_file)
    if quality_format == "sanger":
        logger.info("Quality format detected as sanger.")
        quality_base = 33
    elif quality_format == "illumina":
        logger.info("Quality format set to illumina 1.5/1.3")
        quality_base = 64
    else:
        logger.error("Quality format could not be detected. Quality "
                     "detected or set as %s. It should be illumina "
                     "or sanger." % quality_format)
        exit(1)
    # if we want to trim the polya tails we have to first remove
    # the adapters and then trim the tail
    if self.stage_config.get("trim_polya", True):
        temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq", dir=self.out_dir)
        # trim off adapters
        cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                quality_base=quality_base, out=temp_cut.name))
        do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
        with file_transaction(out_file) as temp_out:
            polya = ADAPTERS.get("polya")
            # trim off polya
            cmd = str(cutadapt.bake(temp_cut.name, self.options, "-a", polya,
                                    "-a", self._rc_adapters(polya),
                                    quality_base=quality_base, out=temp_out))
            do.run(cmd, "Cutadapt trim of polyA tail of %s." % (temp_cut.name), None)
        return out_file
    else:
        with file_transaction(out_file) as temp_out:
            cmd = str(cutadapt.bake(in_file, self.options, adapters, out=temp_out))
            do.run(cmd, "Cutadapt trim of %s." % (in_file))
        return out_file
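# `_detect_fastq_format` is referenced but not shown; this is a minimal sketch
# (an assumption, not the class's actual implementation) of how the offset can
# be inferred: Sanger/Illumina 1.8+ qualities start at '!' (ASCII 33), while
# Illumina 1.3-1.5 qualities start at '@' (ASCII 64), so any character below
# ASCII 59 can only occur with the phred+33 offset.
def detect_quality_offset(quality_lines):
    """Guess 'sanger' vs 'illumina' from iterable FASTQ quality strings."""
    for line in quality_lines:
        if any(ord(ch) < 59 for ch in line):
            return "sanger"      # phred+33: such low chars are impossible in phred+64
    return "illumina"            # no low characters seen; assume phred+64

assert detect_quality_offset(["IIIIHHG#"]) == "sanger"
assert detect_quality_offset(["hhhggfed"]) == "illumina"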
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, the sample is not human or the sample VCFs "
                    "do not match; skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because cyvcf2 is incredibly noisy with 'no intervals found' messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
        cmd = ("{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
               "{vcf_file} {ped_file} 2> {stderr_log}")
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            def allowed_errors(l):
                return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0))
            if any([allowed_errors(l) for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
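# The except branch above keeps only the last 100 stderr lines with a bounded
# deque before classifying the failure. A small standalone sketch of that
# pattern (file name and contents hypothetical):
import collections

def tail_lines(path, n=100):
    """Return the last n lines of a file without holding it all in memory."""
    to_show = collections.deque(maxlen=n)
    with open(path) as in_handle:
        for line in in_handle:
            to_show.append(line)
    return list(to_show)

# e.g. lines = tail_lines("run-stderr.log"); then scan `lines` for known
# benign errors before deciding whether to re-raise.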
def check(): ok = os.path.exists(target_file) if not ok: logger.info("Did not find output file {0}".format(target_file)) return ok
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''): """Create bgzipped fastq files from an input BAM file. """ # tools config = data["config"] bamtofastq = config_utils.get_program("bamtofastq", config) resources = config_utils.get_resources("bamtofastq", config) cores = config["algorithm"].get("num_cores", 1) max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores bgzip = tools.get_bgzip_cmd(config, is_retry) # files work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep")) out_file_1 = os.path.join( work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix)) out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz") needs_retry = False if is_retry or not utils.file_exists(out_file_1): if not bam.is_paired(bam_file): out_file_2 = None with file_transaction(config, out_file_1) as tx_out_file: for f in [tx_out_file, out_file_1, out_file_2]: if f and os.path.exists(f): os.remove(f) fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file) prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0) if prep_cmd: fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0] if bam.is_paired(bam_file): prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1) fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2) if prep_cmd: fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd out_str = ( "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null " "O2=/dev/null collate=1 colsbs={max_mem}") else: out_str = "S=>({fq1_bgzip_cmd})" bam_file = objectstore.cl_input(bam_file) extra_opts = " ".join( [str(x) for x in resources.get("options", [])]) cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str try: do.run(cmd.format(**locals()), "BAM to bgzipped fastq", checks=[do.file_reasonable_size(tx_out_file, bam_file)], log_error=False) except subprocess.CalledProcessError as msg: if not is_retry and "deflate failed" in str(msg): logger.info( "bamtofastq deflate IO failure preparing %s. Retrying with single core." % (bam_file)) needs_retry = True else: logger.exception() raise if needs_retry: return _bgzip_from_bam(bam_file, dirs, data, is_retry=True) else: return [ x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x) ]
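# `config_utils.convert_to_bytes` above scales a per-core memory string by the
# core count to size bamtofastq's collation buffer (colsbs). A hedged sketch of
# that conversion, under the assumption it accepts suffixed values like
# "1G"/"750M" (the real helper may differ in detail):
def convert_to_bytes_sketch(mem_str):
    """Convert a memory string such as '1G' or '750M' into bytes."""
    units = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    suffix = mem_str[-1].upper()
    if suffix in units:
        return int(float(mem_str[:-1]) * units[suffix])
    return int(mem_str)

# With 4 cores and "1G" per core, the collation buffer becomes 4 GiB:
assert convert_to_bytes_sketch("1G") * 4 == 4 * 1024 ** 3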
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not vcfanno.is_human(data):
        if not peddy:
            reason = "peddy executable not found"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirect stderr because cyvcf2 is incredibly noisy with 'no intervals found' messages
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            cmd = ("{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)
                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))
                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)
                if any([allowed_errors(l) for l in to_show]) or all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def check(): ok = utils.file_exists(target_file) if not ok: logger.info( "Did not find non-empty output file {0}".format(target_file)) return ok
def umi_transform(data): """ transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fqfiles = data["files"] fqfiles.extend(list(repeat("", 4 - len(fqfiles)))) fq1, fq2, fq3, fq4 = fqfiles umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info( "No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info( "%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return [[data]] else: logger.error( "No UMI transform was specified, but %s does not look " "pre-transformed." % fq1) sys.exit(1) if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] cellular_barcodes = get_cellular_barcodes(data) if len(cellular_barcodes) > 1: split_option = "--separate_cb" else: split_option = "" if dd.get_demultiplexed(data): demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data) split_option = "" else: demuxed_option = "" umis = config_utils.get_program("umis", data, default="umis") cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return [[data]] locale_export = utils.locale_export() cmd = ( "{locale_export}{umis} fastqtransform {split_option} {transform_file} " "--cores {cores} {demuxed_option} " "{fq1} {fq2} {fq3} {fq4}" "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ( "Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
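# After `umis fastqtransform`, the barcode and UMI land in the read name, which
# is what the "UMI_" check above detects. A hedged sketch of that detection on
# a toy record (the exact name layout is illustrative of umis output):
def looks_transformed(first_read_name):
    """Heuristic: transformed reads carry the UMI in their name."""
    return "UMI_" in first_read_name

assert looks_transformed("@HWI-ST1000:1:1101:CELL_ACGT:UMI_TTAGGC")
assert not looks_transformed("@HWI-ST1000:1:1101:2179:2201 1:N:0:")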
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq.
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]
    adapter = dd.get_adapters(data)
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("Adapter is set up in the config file, but trim_reads is not true. "
                    "If you want to skip trimming, remove the adapter option from the config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options were found in the YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "atropos with options %s for %s" % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s; this may indicate a lack of "
                        "small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _get_samples_to_process(fn, out_dir, config, force_single, separators):
    """Parse a CSV file with one line per input file, merging all
    files that share the same description name."""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if l.find("description") > 0:
                logger.info("Skipping header.")
                continue
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]) or is_srr(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Spaces found in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("Skipping %s: file doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        elif is_srr(items[0][0]):
            fn = "query_srr"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if utils.file_exists(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single, separators),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
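# `_check_paired` is referenced but not shown; this sketch (an assumption, not
# the real helper) shows the usual idea: group R1/R2 mates by stripping a
# separator token such as "R1"/"R2" or "_1"/"_2" from the file stem.
import os
from collections import defaultdict

def check_paired_sketch(files, force_single=False, separators=("R", "_", "-", ".")):
    """Group fastq files into [r1, r2] pairs by shared stem; singletons stay alone."""
    if force_single:
        return [[fname] for fname in sorted(files)]
    groups = defaultdict(dict)
    for fname in sorted(files):
        stem = os.path.basename(fname)
        matched = False
        for sep in separators:
            for read, tag in (("r1", "%s1" % sep), ("r2", "%s2" % sep)):
                if tag in stem:
                    groups[stem.replace(tag, "")][read] = fname
                    matched = True
                    break
            if matched:
                break
        if not matched:
            groups[stem]["r1"] = fname
    return [[pair[k] for k in ("r1", "r2") if k in pair]
            for key, pair in sorted(groups.items())]

assert check_paired_sketch(["s_R1.fq.gz", "s_R2.fq.gz"]) == [["s_R1.fq.gz", "s_R2.fq.gz"]]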
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error("The UMI transform can be specified as either a file or a "
                         "bcbio-supported transform. Either the file %s does not exist "
                         "or the transform is not supported by bcbio. Supported "
                         "transforms are %s." % (dd.get_umi_type(data),
                                                 ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data
    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = "Inserting UMI and barcode information into the read name of %s" % fq1
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples): samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples) with prun.start( _wres(parallel, ["aligner", "picard", "samtools"], ensure_mem={ "tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8 }), samples, config, dirs, "alignment", multiplier=alignprep.parallel_multiplier(samples)) as run_parallel: with profile.report("alignment", dirs): samples = run_parallel("disambiguate_split", [samples]) samples = run_parallel("process_alignment", samples) with prun.start(_wres(parallel, ["samtools", "cufflinks"]), samples, config, dirs, "rnaseqcount") as run_parallel: with profile.report("disambiguation", dirs): samples = disambiguate.resolve(samples, run_parallel) with profile.report("transcript assembly", dirs): samples = rnaseq.assemble_transcripts(run_parallel, samples) with profile.report("estimate expression (threaded)", dirs): samples = rnaseq.quantitate_expression_parallel( samples, run_parallel) with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config, dirs, "rnaseqcount-singlethread", max_multicore=1) as run_parallel: with profile.report("estimate expression (single threaded)", dirs): samples = rnaseq.quantitate_expression_noparallel( samples, run_parallel) samples = rnaseq.combine_files(samples) with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config, dirs, "rnaseq-variation") as run_parallel: with profile.report("RNA-seq variant calling", dirs): samples = rnaseq.rnaseq_variant_calling(samples, run_parallel) with prun.start( _wres( parallel, ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"], ensure_mem={"qualimap": 4}), samples, config, dirs, "qc") as run_parallel: with profile.report("quality control", dirs): samples = qcsummary.generate_parallel(samples, run_parallel) with profile.report("upload", dirs): samples = run_parallel("upload_samples", samples) for sample in samples: run_parallel("upload_samples_project", [sample]) with profile.report("bcbioRNAseq loading", dirs): tools_on = dd.get_in_samples(samples, dd.get_tools_on) bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on if bcbiornaseq_on and len(samples) == 1: logger.warn( "bcbioRNASeq does not work with just one sample, skipping." ) else: run_parallel("run_bcbiornaseqload", [sample]) logger.info("Timing: finished") return samples
parser.add_argument("-q", "--queue", help="Queue to submit jobs to.") parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"], default="local", help="Run with iptyhon") args = parser.parse_args() system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") with open(system_config) as in_handle: config = yaml.load(in_handle) config["algorithm"] = {"num_cores": 1} samples = _get_samples_to_process(args.csv) prepped = [] if args.paralleltype == "ipython": logger.info("Starting IPython cluster. This may take a while.") with get_cluster_view(args) as view: logger.info("IPython cluster is up.") for sample, info in samples.iteritems(): prepped.append( view.apply_async(info['fn'], info["files"], os.path.join(args.out, info["out_file"]), config)) prepped = wait_until_complete(prepped) else: for sample, info in samples.iteritems(): logger.info("Merging sample: %s" % sample) prepped.append(info['fn'](info["files"], os.path.join(args.out, info["out_file"]), config)) create_new_csv(prepped, samples, args)
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        if dd.get_umi_type(data) == "dragen":
            assert bam.is_bam(fastq1), "umi_type: dragen needs a BAM file as input."
            data = dragen.fix_umi_dragen_bam(data, bam=fastq1)
            # fastq1 = bam.sort(fastq1, dd.get_config(data))
            # bam.index(fastq1, dd.get_config(data))
            # data["work_bam"] = fastq1
        else:
            logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
            data = align_to_sort_bam(fastq1, fastq2, aligner, data)
            if dd.get_correct_umis(data):
                data["work_bam"] = postalign.correct_umis(data)
            if dd.get_umi_consensus(data):
                data["umi_bam"] = dd.get_work_bam(data)
                if fastq2 or dd.get_umi_type(data) == "dragen":
                    f1, f2, avg_cov = postalign.umi_consensus(data)
                    data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                    del data["config"]["algorithm"]["umi_type"]
                    data["config"]["algorithm"]["mark_duplicates"] = False
                    data = align_to_sort_bam(f1, f2, aligner, data)
                else:
                    raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s"
                                     % dd.get_sample_name(data))
            data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"],
                                    "{}-sort.bam".format(os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1,
                                     os.path.join(dd.get_work_dir(data), "prealign",
                                                  dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn't need a bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
    data = _add_hla_files(data)
    return [[data]]
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    # Construct name of sorted input files
    work_bam_a_nsorted = os.path.splitext(data_a["work_bam"])[0] + '.nsorted.bam'
    work_bam_b_nsorted = os.path.splitext(data_b["work_bam"])[0] + '.nsorted.bam'
    # logger.info('Disambiguate prep of input BAM {} and {}'.format(work_bam_a_nsorted, work_bam_b_nsorted))
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir, os.pardir,
                         "disambiguate_%s" % aligner)))
        logger.info('Disambiguate prep of prepped work bam BAM {} with base dir {}'.format(
            work_bam_a_nsorted, base_dir))
        split_name = "_".join([str(x) for x in data_a["align_split"].split("-")])
        out_dir = os.path.join(base_dir, split_name)
        logger.info('Disambiguate prep of prepped work bam BAM {} with out dir {}'.format(
            work_bam_a_nsorted, out_dir))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a_nsorted),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a_nsorted))[0])
    logger.info('Disambiguate prep of prepped work bam BAM {} with base name {}'.format(
        work_bam_a_nsorted, base_name))
    summary_file = "%s_summary.txt" % base_name
    explant_bam = "%s.explant.sorted.bam" % base_name
    ambiguous_bam = "%s.ambiguous.sorted.bam" % base_name
    work_bam = "%s.human.sorted.bam" % base_name
    logger.info('Disambiguate prep with work bam {}'.format(work_bam))
    logger.info('Deciding if disambiguation is required.\n'
                'Checking for existence of {}, {}, {} and {}'.format(
                    summary_file, explant_bam, ambiguous_bam, work_bam))
    if not utils.file_exists(summary_file) or not utils.file_exists(explant_bam) \
            or not utils.file_exists(ambiguous_bam) or not utils.file_exists(work_bam):
        logger.info('Disambiguating work bam a {} since outputs are not already existing'.format(
            work_bam_a_nsorted))
        work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
        work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
        logger.info('Disambiguate run with work bam a {}'.format(work_bam_a))
        logger.info('Disambiguate run with work bam b {}'.format(work_bam_b))
        with file_transaction(items[0], out_dir) as tx_out_dir:
            logger.info('Disambiguate run with sorted prep work bam a {} and tx out dir {}'.format(
                work_bam_a_nsorted, tx_out_dir))
            tmp_base_name = os.path.join(tx_out_dir, os.path.basename(base_name))
            logger.info('Disambiguate run with sorted prep work bam a {} and tmp_base_name {}'.format(
                work_bam_a_nsorted, tmp_base_name))
            pdx_filter = PDXFilter(work_bam_a, work_bam_b,
                                   "%s.human.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s.explant.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s.ambiguous.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s_summary.txt" % tmp_base_name,
                                   hard_filter=True,
                                   debug=True)
            pdx_filter.run()
        # Perhaps this can be removed since it has been fixed in bcbio
        if data_a.get("align_split"):
            split_dir = os.path.join(out_dir, split_name)
            logger.info('Disambiguate post-run with sorted prep work bam a {} and split dir {}'.format(
                work_bam_a_nsorted, split_dir))
            if os.path.isdir(split_dir):
                for tmp_file in os.listdir(split_dir):
                    logger.info('Disambiguate post-run with sorted prep work bam a {} aiming to move file {}'.format(
                        work_bam_a_nsorted, tmp_file))
                    src = os.path.join(split_dir, tmp_file)
                    if os.path.isfile(src):
                        dest = os.path.join(out_dir, tmp_file)
                        logger.info('Disambiguate post-run with sorted prep work bam a {} moving file {} from {} to {}'.format(
                            work_bam_a_nsorted, tmp_file, src, dest))
                        shutil.move(src, dest)
                shutil.rmtree(split_dir)
        try:
            if work_bam_a != data_a["work_bam"]:
                os.remove(work_bam_a)
        except:
            pass
        try:
            if work_bam_b != data_b["work_bam"]:
                os.remove(work_bam_b)
        except:
            pass
    else:
        logger.info('Skipping disambiguation for work bam a {} since outputs are already existing'.format(
            work_bam_a_nsorted))
    explant_bam = os.path.isfile(explant_bam) and explant_bam or bam.sort(
        "%s.explant.bam" % base_name, config)
    ambiguous_bam = os.path.isfile(ambiguous_bam) and ambiguous_bam or bam.sort(
        "%s.ambiguous.bam" % base_name, config)
    work_bam = os.path.isfile(work_bam) and work_bam or bam.sort(
        "%s.human.bam" % base_name, config)
    # logger.info('Disambiguate run with post work_bam {}'.format(work_bam))
    data_a["disambiguate"] = {data_b["genome_build"]: explant_bam,
                              "%s-ambiguous" % data_a["genome_build"]: ambiguous_bam,
                              "summary": summary_file}
    data_a["work_bam"] = work_bam
    try:
        os.remove("%s.explant.bam" % base_name)
    except:
        pass
    try:
        os.remove("%s.human.bam" % base_name)
    except:
        pass
    try:
        os.remove("%s.ambiguous.bam" % base_name)
    except:
        pass
    return [[data_a]]
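# PDXFilter is an external helper; the core idea of explant disambiguation is
# comparing the same read's alignment quality against the graft (human) and
# host (e.g. mouse) references. A simplified, hedged sketch of that decision
# with pysam, comparing alignment scores (AS tags) from name-sorted BAMs; real
# tools also handle read pairs, multi-mappers and missing mates.
import pysam

def classify_read(human_read, mouse_read, min_diff=5):
    """Assign a read to 'human', 'explant' (host) or 'ambiguous' by AS score."""
    human_as = (human_read.get_tag("AS")
                if human_read and not human_read.is_unmapped else float("-inf"))
    mouse_as = (mouse_read.get_tag("AS")
                if mouse_read and not mouse_read.is_unmapped else float("-inf"))
    if human_as >= mouse_as + min_diff:
        return "human"
    if mouse_as >= human_as + min_diff:
        return "explant"
    return "ambiguous"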
def combine_calls(*args): """Combine multiple callsets into a final set of merged calls. """ if len(args) == 3: is_cwl = False batch_id, samples, data = args caller_names, vrn_files = _organize_variants(samples, batch_id) else: is_cwl = True samples = [utils.to_single_data(x) for x in args] samples = [cwlutils.unpack_tarballs(x, x) for x in samples] data = samples[0] batch_id = data["batch_id"] caller_names = data["variants"]["variantcallers"] vrn_files = data["variants"]["calls"] logger.info("Ensemble consensus calls for {0}: {1}".format( batch_id, ",".join(caller_names))) edata = copy.deepcopy(data) base_dir = utils.safe_makedir( os.path.join(edata["dirs"]["work"], "ensemble", batch_id)) if any([vcfutils.vcf_has_variants(f) for f in vrn_files]): # Decompose multiallelic variants and normalize passonly = not tz.get_in( ["config", "algorithm", "ensemble", "use_filtered"], edata, False) vrn_files = [ normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True, nonrefonly=True, work_dir=utils.safe_makedir( os.path.join(base_dir, c))) for c, f in zip(caller_names, vrn_files) ] if "classifiers" not in (dd.get_ensemble(edata) or {}): callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata) else: config_file = _write_config_file(batch_id, caller_names, base_dir, edata) callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir, dd.get_ref_file(edata), edata) callinfo["vrn_file"] = vcfutils.bgzip_and_index( callinfo["vrn_file"], data["config"]) # After decomposing multiallelic variants and normalizing, re-evaluate effects ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data) if ann_ma_file: callinfo["vrn_file"] = ann_ma_file edata["config"]["algorithm"]["variantcaller"] = "ensemble" edata["vrn_file"] = callinfo["vrn_file"] edata["ensemble_bed"] = callinfo["bed_file"] callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get( "validate") else: out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id)) vcfutils.write_empty_vcf( out_vcf_file, samples=[dd.get_sample_name(d) for d in samples]) callinfo = { "variantcaller": "ensemble", "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]), "bed_file": None } if is_cwl: callinfo["batch_samples"] = data["batch_samples"] callinfo["batch_id"] = batch_id return [{"ensemble": callinfo}] else: return [[batch_id, callinfo]]
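# `_run_ensemble_intersection` (referenced above) implements the no-classifier
# path: keep variants supported by a minimum number of callers. A toy, hedged
# illustration of that consensus rule, with plain tuples standing in for
# normalized variant records:
def ensemble_intersection_sketch(calls_by_caller, min_callers=2):
    """Return variants seen by at least min_callers callers."""
    support = {}
    for caller, calls in calls_by_caller.items():
        for variant in calls:
            support.setdefault(variant, set()).add(caller)
    return sorted(v for v, callers in support.items() if len(callers) >= min_callers)

calls = {"gatk": [("chr1", 100, "A", "G"), ("chr1", 200, "C", "T")],
         "vardict": [("chr1", 100, "A", "G")],
         "freebayes": [("chr1", 100, "A", "G"), ("chr1", 300, "G", "A")]}
assert ensemble_intersection_sketch(calls) == [("chr1", 100, "A", "G")]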
def _get_preseq_params(data, preseq_cmd, read_count):
    """Get parameters through resources.
    If the "step" or "extrap" limits are not provided, calculate
    optimal values based on the read count.
    """
    defaults = {
        'seg_len': 100000,     # maximum segment length when merging paired end bam reads
        'steps': 300,          # number of points on the plot
        'extrap_fraction': 3,  # extrapolate up to X times read_count
        'extrap': None,        # extrapolate up to X reads
        'step': None,          # step size (number of reads between points on the plot)
        'options': '',
    }
    params = {}
    main_opts = [("-e", "-extrap"), ("-l", "-seg_len"), ("-s", "-step")]
    other_opts = config_utils.get_resources("preseq", data["config"]).get("options", [])
    if isinstance(other_opts, str):
        other_opts = [other_opts]
    for sht, lng in main_opts:
        if sht in other_opts:
            i = other_opts.index(sht)
        elif lng in other_opts:
            i = other_opts.index(lng)
        else:
            i = None
        if i is not None:
            params[lng[1:]] = other_opts[i + 1]
            other_opts = other_opts[:i] + other_opts[i + 2:]
    params['options'] = ' '.join(other_opts)
    for k, v in config_utils.get_resources("preseq", data["config"]).items():
        if k != 'options':
            params[k] = v
    params['steps'] = params.get('steps', defaults['steps'])
    if preseq_cmd == 'c_curve':
        params['extrap_fraction'] = 1
    else:
        if params.get('step') is None:
            if params.get('extrap') is None:
                unrounded_extrap = read_count * params.get('extrap_fraction',
                                                           defaults['extrap_fraction'])
                unrounded_step = unrounded_extrap // params['steps']
                if params.get('extrap_fraction') is not None:  # extrap_fraction explicitly provided
                    params['extrap'] = unrounded_extrap
                    params['step'] = unrounded_step
                else:
                    power_of_10 = 10 ** math.floor(math.log(unrounded_step, 10))
                    rounded_step = int(math.floor(unrounded_step // power_of_10) * power_of_10)
                    rounded_extrap = int(rounded_step) * params['steps']
                    params['step'] = rounded_step
                    params['extrap'] = rounded_extrap
            else:
                params['step'] = params['extrap'] // params['steps']
        elif params.get('extrap') is None:
            params['extrap'] = params['step'] * params['steps']
    params['step'] = params.get('step', defaults['step'])
    params['extrap'] = params.get('extrap', defaults['extrap'])
    params['seg_len'] = params.get('seg_len', defaults['seg_len'])
    logger.info("Preseq: running {steps} steps of size {step}, extrap limit {extrap}".format(**params))
    return params
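# A worked example of the rounding arithmetic above, assuming 20M reads and
# the defaults (extrap_fraction=3, steps=300): the unrounded step is
# 6e7 / 300 = 200000, which already sits on a clean power-of-ten multiple, so
# the rounded step stays 200000 and the extrapolation limit becomes 6e7 reads.
import math

read_count = 20_000_000
steps = 300
unrounded_extrap = read_count * 3            # 60,000,000
unrounded_step = unrounded_extrap // steps   # 200,000
power_of_10 = 10 ** math.floor(math.log(unrounded_step, 10))   # 100,000
rounded_step = int(math.floor(unrounded_step // power_of_10) * power_of_10)
assert rounded_step == 200_000
assert rounded_step * steps == 60_000_000    # extrap limit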
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items): ## Alignment and preparation requiring the entire input file (multicore cluster) with global_parallel(parallel, "multicore", ["align_prep_full"], lane_items, dirs["work"], config) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: alignment") samples = run_parallel( "align_prep_full", [list(x) + [config_file] for x in lane_items]) regions = callable.combine_sample_regions(samples) samples = region.add_region_info(samples, regions) samples = region.clean_sample_data(samples) logger.info("Timing: coverage") samples = coverage.summarize_samples(samples, run_parallel) ## Variant calling on sub-regions of the input file (full cluster) with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"], samples, dirs["work"], config, multiplier=len(regions["analysis"])) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: alignment post-processing") samples = region.parallel_prep_region(samples, regions, run_parallel) logger.info("Timing: variant calling") samples = region.parallel_variantcall_region(samples, run_parallel) ## Finalize variants (per-sample cluster) with global_parallel(parallel, "persample", ["postprocess_variants"], samples, dirs["work"], config) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: variant post-processing") samples = run_parallel("postprocess_variants", samples) samples = combine_multiple_callers(samples) logger.info("Timing: ensemble calling") samples = ensemble.combine_calls_parallel(samples, run_parallel) logger.info("Timing: prepped BAM merging") samples = region.delayed_bamprep_merge(samples, run_parallel) logger.info("Timing: validation") samples = run_parallel("compare_to_rm", samples) samples = validate.summarize_grading(samples) logger.info("Timing: population database") samples = population.prep_db_parallel(samples, run_parallel) logger.info("Timing: quality control") samples = qcsummary.generate_parallel(samples, run_parallel) logger.info("Timing: finished") return samples
def run(self, config, config_file, run_parallel, parallel, dirs, samples): ## Alignment and preparation requiring the entire input file (multicore cluster) with global_parallel(parallel, "multicore", ["process_alignment", "postprocess_alignment"], samples, dirs, config, multiplier=alignprep.parallel_multiplier(samples)) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: alignment") samples = run_parallel("prep_align_inputs", samples) samples = run_parallel("process_alignment", samples) samples = alignprep.merge_split_alignments(samples, run_parallel) samples = run_parallel("postprocess_alignment", samples) regions = callable.combine_sample_regions(samples) samples = region.add_region_info(samples, regions) samples = region.clean_sample_data(samples) logger.info("Timing: coverage") samples = coverage.summarize_samples(samples, run_parallel) ## Variant calling on sub-regions of the input file (full cluster) with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"], samples, dirs, config, multiplier=len(regions["analysis"]), max_multicore=1) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: alignment post-processing") samples = region.parallel_prep_region(samples, regions, run_parallel) logger.info("Timing: variant calling") samples = region.parallel_variantcall_region(samples, run_parallel) ## Finalize variants (per-sample cluster) with global_parallel(parallel, "persample", ["postprocess_variants"], samples, dirs, config) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: variant post-processing") samples = run_parallel("postprocess_variants", samples) logger.info("Timing: validation") samples = run_parallel("compare_to_rm", samples) samples = combine_multiple_callers(samples) logger.info("Timing: ensemble calling") samples = ensemble.combine_calls_parallel(samples, run_parallel) samples = validate.summarize_grading(samples) ## Finalizing BAMs and population databases, handle multicore computation with global_parallel(parallel, "multicore2", ["prep_gemini_db", "delayed_bam_merge"], samples, dirs, config) as parallel: run_parallel = parallel_runner(parallel, dirs, config) logger.info("Timing: prepped BAM merging") samples = region.delayed_bamprep_merge(samples, run_parallel) logger.info("Timing: structural variation") samples = structural.run(samples, run_parallel) logger.info("Timing: population database") samples = population.prep_db_parallel(samples, run_parallel) logger.info("Timing: quality control") samples = qcsummary.generate_parallel(samples, run_parallel) logger.info("Timing: finished") return samples
def postprocess_variants(items): """Provide post-processing of variant calls: filtering and effects annotation. """ data = _get_batch_representative(items, "vrn_file") cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data)) logger.info("Finalizing variant calls: %s" % cur_name) orig_vrn_file = data.get("vrn_file") data = _symlink_to_workdir(data, ["vrn_file"]) data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"]) if data.get("align_bam") and data.get("vrn_file"): logger.info("Calculating variation effects for %s" % cur_name) ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data) if ann_vrn_file: data["vrn_file"] = ann_vrn_file if vrn_stats: data["vrn_stats"] = vrn_stats orig_items = _get_orig_items(items) logger.info("Annotate VCF file: %s" % cur_name) data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data), orig_items) logger.info("Filtering for %s" % cur_name) data["vrn_file"] = variant_filtration( data["vrn_file"], dd.get_ref_file(data), tz.get_in(("genome_resources", "variation"), data, {}), data, orig_items) logger.info("Prioritization for %s" % cur_name) data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items) logger.info("Germline extraction for %s" % cur_name) data = germline.extract(data, orig_items) data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data), dd.get_ref_file(data), data, orig_items) if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file): data["vrn_file"] = orig_vrn_file return [[data]]
def run(bam_file, data, fastqc_out): """Run fastqc, generating report in specified directory and parsing metrics. Downsamples to 10 million reads to avoid excessive processing times with large files, unless we're running a Standard/smallRNA-seq/QC pipeline. Handles fastqc 0.11+, which use a single HTML file and older versions that use a directory of files + images. The goal is to eventually move to only 0.11+ """ sentry_file = os.path.join(fastqc_out, "fastqc_report.html") if not os.path.exists(sentry_file): work_dir = os.path.dirname(fastqc_out) utils.safe_makedir(work_dir) ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir) if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"] else None) if ds_file is not None: bam_file = ds_file frmt = "bam" if bam_file.endswith("bam") else "fastq" fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0] fastqc_clean_name = dd.get_sample_name(data) num_cores = data["config"]["algorithm"].get("num_cores", 1) with tx_tmpdir(data, work_dir) as tx_tmp_dir: with utils.chdir(tx_tmp_dir): cl = [ config_utils.get_program("fastqc", data["config"]), "-d", tx_tmp_dir, "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file ] cl = "%s %s" % (utils.local_path_export(), " ".join( [str(x) for x in cl])) do.run(cl, "FastQC: %s" % dd.get_sample_name(data)) tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name) tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name) if not os.path.exists(sentry_file) and os.path.exists( tx_combo_file): utils.safe_makedir(fastqc_out) # Use sample name for reports instead of bam file name with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \ open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name: for line in fastqc_bam_name: fastqc_sample_name.write( line.replace(os.path.basename(bam_file), fastqc_clean_name)) shutil.move( os.path.join(tx_fastqc_out, "_fastqc_data.txt"), os.path.join(fastqc_out, 'fastqc_data.txt')) shutil.move(tx_combo_file, sentry_file) if os.path.exists("%s.zip" % tx_fastqc_out): shutil.move( "%s.zip" % tx_fastqc_out, os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name)) elif not os.path.exists(sentry_file): raise ValueError( "FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir)) logger.info("Produced HTML report %s" % sentry_file) parser = FastQCParser(fastqc_out, dd.get_sample_name(data)) stats = parser.get_fastqc_summary() parser.save_sections_into_file() return stats
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists( os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir) cmd = ( "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [ None, False, "None" ] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps( bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % ( dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return { "base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file) }
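# `_bed_to_bed6` is referenced above but not shown; qualimap's -gff option
# accepts 6-column BED, so narrower inputs need padding with name/score/strand
# placeholders. A hedged sketch of that conversion (helper name reused for
# illustration; the real implementation may differ):
def bed_to_bed6_sketch(in_bed, out_bed):
    """Pad a BED3/BED4 file to the 6 columns qualimap expects."""
    with open(in_bed) as in_handle, open(out_bed, "w") as out_handle:
        for i, line in enumerate(in_handle):
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 3 or line.startswith(("#", "track", "browser")):
                continue
            chrom, start, end = parts[:3]
            name = parts[3] if len(parts) > 3 else "region_%s" % i
            out_handle.write("\t".join([chrom, start, end, name, "0", "+"]) + "\n")
    return out_bed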
def _memoized_message(self, in_file, out_file): logger.info("%s already run on %s and stored as %s, skipping." % (self.stage, in_file, out_file))