def get_contigs(data):
    """Return the names of contigs to process for this sample.

    Takes the contigs reported by ``shared.get_noalt_contigs(data)`` and keeps
    only the names that ``chromhacks`` classifies as autosomal or as a sex
    chromosome. Input order is preserved.
    """
    names = (contig.name for contig in shared.get_noalt_contigs(data))
    return [
        name for name in names
        if chromhacks.is_autosomal(name) or chromhacks.is_sex(name)
    ]
def depth_freq_filter(line, tumor_index, aligner):
    """Annotate one VarDict VCF line with depth/frequency/quality soft filters.

    Header handling:

    - The ``#CHROM`` line is returned with the two ``##FILTER`` definitions
      (LowAlleleDepth, LowFreqQuality) inserted before it.
    - Any other ``#`` line is returned unchanged.

    For a record line, the FORMAT column is paired with the sample column at
    index ``9 + tumor_index``. DP, AF, NM and MQ come from that sample, QUAL
    from column 6 and the SSF p-value from INFO. Each value goes through
    ``utils.safe_to_float``; a check is skipped when a value it needs is None.

    ``LowAlleleDepth`` is set when the contig is not a sex chromosome,
    ``DP * AF < 6`` and at least one of these holds:

    - ``aligner == "bwa"`` and (MQ < 55.0 and NM > 1.0) or
      (MQ < 60.0 and NM > 2.0)
    - DP < 10
    - QUAL < 45

    ``LowFreqQuality`` is set (on any contig) when all of these hold:

    - AF < 0.2
    - QUAL < 45
    - SSF > 0.06

    When both apply, ``LowFreqQuality`` is the one recorded. The chosen filter
    replaces a FILTER value of ``.`` or ``PASS``; otherwise it is appended
    after a ``;``.

    NOTE(review): earlier documentation gave the LowFreqQuality cutoff as
    QUAL < 55 while the code compares against 45 -- confirm the intended value.

    Returns the (possibly modified) line as a string.
    """
    if line.startswith("#CHROM"):
        filter_headers = (
            '##FILTER=<ID=LowAlleleDepth,Description="Low depth per allele frequency '
            'along with poor depth, quality, mapping quality and read mismatches.">',
            '##FILTER=<ID=LowFreqQuality,Description="Low frequency read with '
            'poor quality and p-value (SSF).">',
        )
        return "%s\n%s" % ("\n".join(filter_headers), line)
    if line.startswith("#"):
        return line

    fields = line.split("\t")
    format_keys = fields[8].split(":")
    sample_values = fields[9 + tumor_index].split(":")
    sample_ft = dict(zip(format_keys, sample_values))

    qual = utils.safe_to_float(fields[5])
    dp = utils.safe_to_float(sample_ft.get("DP"))
    af = utils.safe_to_float(sample_ft.get("AF"))
    nm = utils.safe_to_float(sample_ft.get("NM"))
    mq = utils.safe_to_float(sample_ft.get("MQ"))

    # The first SSF=... entry in INFO carries the p-value, when present.
    ssf_entry = next(
        (item for item in fields[7].split(";") if item.startswith("SSF=")),
        None)
    pval = utils.safe_to_float(
        ssf_entry.split("=")[-1] if ssf_entry is not None else None)

    fname = None

    low_support = (not chromhacks.is_sex(fields[0])
                   and dp is not None
                   and af is not None
                   and dp * af < 6)
    if low_support:
        poor_mapping = (aligner == "bwa"
                        and nm is not None
                        and mq is not None
                        and ((mq < 55.0 and nm > 1.0)
                             or (mq < 60.0 and nm > 2.0)))
        shallow = dp < 10
        weak_qual = qual is not None and qual < 45
        if poor_mapping or shallow or weak_qual:
            fname = "LowAlleleDepth"

    have_freq_inputs = (af is not None
                        and qual is not None
                        and pval is not None)
    if have_freq_inputs and af < 0.2 and qual < 45 and pval > 0.06:
        # Deliberately evaluated last so it takes precedence when both apply.
        fname = "LowFreqQuality"

    if fname:
        if fields[6] in (".", "PASS"):
            fields[6] = fname
        else:
            fields[6] = "%s;%s" % (fields[6], fname)
    return "\t".join(fields)
def _goleft_indexcov(bam_file, data, out_dir):
    """Run ``goleft indexcov`` to summarize coverage from the BAM index.

    Only runs when the sample's coverage interval is ``"genome"``; the
    original author notes that captures typically don't have enough data for
    index-only summaries. Any other interval returns an empty list.

    Outputs land in ``<out_dir>/indexcov`` as ``<sample>-indexcov.roc``,
    ``.ped`` and ``.bed.gz``. The gzipped BED is also unpacked to
    ``<sample>-indexcov.tsv``, which takes its place in the result.

    Returns the list of expected output files that exist on disk.
    """
    # NOTE(review): a later definition of `_goleft_indexcov` in this file
    # replaces this one at import time -- confirm which version is intended.
    if dd.get_coverage_interval(data) != "genome":
        return []

    sample_name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = []
    for ext in ("roc", "ped", "bed.gz"):
        out_files.append(
            os.path.join(out_dir, "%s-indexcov.%s" % (sample_name, ext)))
    bed_gz = out_files[-1]

    if not utils.file_uptodate(bed_gz, bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            # indexcov writes into a directory named after the sample.
            work_dir = utils.safe_makedir(os.path.join(tmp_dir, sample_name))
            sex_contigs = [
                contig.name
                for contig in ref.file_contigs(dd.get_ref_file(data))
                if chromhacks.is_sex(contig.name)
            ]
            gender_args = "--sex " + ",".join(sex_contigs) if sex_contigs else ""
            # XXX Skip gender args until we can correctly specify #1793
            gender_args = ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(tmp_dir=work_dir,
                                  gender_args=gender_args,
                                  bam_file=bam_file),
                       "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                err = str(msg)
                no_usable = "indexcov: no usable" in err
                sex_mismatch = ("indexcov: expected" in err
                                and "sex chromosomes, found:" in err)
                # Tolerate these two known indexcov failures; re-raise the rest.
                if not (no_usable or sex_mismatch):
                    raise
            for out_file in out_files:
                produced = os.path.join(work_dir, os.path.basename(out_file))
                if utils.file_exists(produced):
                    utils.copy_plus(produced, out_file)

    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = bed_gz.replace(".bed.gz", ".tsv")
    if utils.file_exists(bed_gz) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            do.run("gunzip -c %s > %s" % (bed_gz, tx_out_bed),
                   "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [path for path in out_files if utils.file_exists(path)]
def _goleft_indexcov(bam_file, data, out_dir):
    """Estimate coverage distributions with ``goleft indexcov`` via the BAM index.

    Restricted to samples whose coverage interval is ``"genome"``; the
    original author notes that captures typically don't have enough data for
    index-only summaries. Other samples get an empty list back.

    The contigs of the sample's reference that ``chromhacks.is_sex`` accepts
    are passed to indexcov with ``--sex`` when there are any. The ``roc``,
    ``ped`` and ``bed.gz`` outputs are copied into ``<out_dir>/indexcov``. The
    gzipped BED is then unpacked to a ``.tsv`` file, which replaces it in the
    result.

    Returns the subset of expected output files that exist.
    """
    # NOTE(review): an earlier definition of `_goleft_indexcov` exists in this
    # file; being later, this one is what the module ends up exposing.
    coverage_interval = dd.get_coverage_interval(data)
    if not coverage_interval == "genome":
        return []

    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    extensions = ["roc", "ped", "bed.gz"]
    out_files = [
        os.path.join(out_dir,
                     "%s-indexcov.%s" % (dd.get_sample_name(data), extension))
        for extension in extensions
    ]

    needs_run = not utils.file_uptodate(out_files[-1], bam_file)
    if needs_run:
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(
                os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = []
            for contig in ref.file_contigs(dd.get_ref_file(data)):
                if chromhacks.is_sex(contig.name):
                    gender_chroms.append(contig.name)
            if gender_chroms:
                gender_args = "--sex %s" % ",".join(gender_chroms)
            else:
                gender_args = ""
            command = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}".format(
                tmp_dir=tmp_dir, gender_args=gender_args, bam_file=bam_file)
            try:
                do.run(command, "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                text = str(msg)
                # Two indexcov failures are tolerated: no usable data, and a
                # mismatch in the expected sex chromosomes.
                if "indexcov: no usable" in text:
                    pass
                elif ("indexcov: expected" in text
                      and "sex chromosomes, found:" in text):
                    pass
                else:
                    raise
            for final_path in out_files:
                tmp_path = os.path.join(tmp_dir, os.path.basename(final_path))
                if utils.file_exists(tmp_path):
                    utils.copy_plus(tmp_path, final_path)

    # MultiQC needs non-gzipped/BED inputs so unpack the file
    gz_path = out_files[-1]
    out_bed = gz_path.replace(".bed.gz", ".tsv")
    if utils.file_exists(gz_path) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (gz_path, tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed

    existing = []
    for candidate in out_files:
        if utils.file_exists(candidate):
            existing.append(candidate)
    return existing