Example #1
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
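
The counts file written above is a plain two-column TSV: a "# sample" header line followed by contig and aligned-read-count pairs. A minimal sketch of reading it back into a dict (read_viral_counts is a hypothetical helper, not part of bcbio):

def read_viral_counts(counts_file):
    """Parse a <sample>-counts.txt file into {contig: aligned_count}."""
    counts = {}
    with open(counts_file) as in_handle:
        for line in in_handle:
            if line.startswith("#"):  # skip the "# sample<TAB>name" header
                continue
            contig, aligned = line.rstrip("\n").split("\t")
            counts[contig] = int(aligned)
    return counts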
Example #2
def run(_, data, out_dir):
    stats_file = os.path.join(utils.safe_makedir(out_dir), "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        total = 0
        mapped = 0
        duplicates = 0
        umi_reductions = []
        umi_counts = collections.defaultdict(int)
        with pysam.AlignmentFile(data["umi_bam"], "rb", check_sq=False) as bam_iter:
            cur_counts = collections.defaultdict(int)
            cur_key = None
            for rec in bam_iter:
                total += 1
                umi = _get_umi_tag(rec)
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    if key != cur_key:
                        # update counts
                        if cur_counts:
                            for c in cur_counts.values():
                                umi_counts[c] += 1
                            total_seqs = sum(cur_counts.values())
                            umi_count = len(cur_counts)
                            umi_reductions.append(float(total_seqs) / umi_count)
                        # update current keys
                        cur_key = key
                        cur_counts = collections.defaultdict(int)
                    cur_counts[umi] += 1
            if cur_counts:
                for c in cur_counts.values():
                    umi_counts[c] += 1
                total_seqs = sum(cur_counts.values())
                umi_count = len(cur_counts)
                umi_reductions.append(float(total_seqs) / umi_count)
        consensus_count = sum([x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        out["umi_raw_avg_cov"] = data["config"]["algorithm"].get("rawumi_avg_cov", 0)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out}, out_handle,
                           default_flow_style=False, allow_unicode=False)
    return stats_file
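
_get_umi_tag is called above but not defined in this example. A minimal sketch of such a helper, assuming the UMI lives in the SAM-standard RX tag (the exact tag handling bcbio uses is an assumption here):

def _get_umi_tag(rec):
    """Return the UMI sequence from a pysam alignment, or None if missing."""
    try:
        return rec.get_tag("RX")  # RX is the SAM spec tag for the raw UMI sequence
    except KeyError:
        return None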
Example #3
def _average_genome_coverage(data, bam_file):
    """Quickly calculate average coverage for whole genome files using indices.

    Includes all reads, with duplicates. Uses sampling of 10M reads.
    """
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sum(x.aligned for x in bam.idxstats(bam_file, data))
    with pysam.AlignmentFile(bam_file, "rb") as pysam_bam:
        read_size = np.median(list(itertools.islice((a.query_length for a in pysam_bam.fetch()), int(1e7))))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
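
The estimate is the familiar coverage identity: avg_cov = aligned_reads * median_read_length / genome_size. A quick sanity check with illustrative numbers (not from any real run):

read_counts = 600_000_000    # aligned reads, from the BAM index
read_size = 150              # median read length in bp
total = 3_100_000_000        # summed contig sizes for the genome
avg_cov = float(read_counts * read_size) / total  # ~29x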
Example #4
def _average_genome_coverage(data, bam_file):
    """Quickly calculate average coverage for whole genome files using indices.

    Includes all reads, with duplicates. Uses sampling of 100K reads.
    """
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sum(x.aligned for x in bam.idxstats(bam_file, data))
    with pysam.AlignmentFile(bam_file, "rb") as pysam_bam:
        read_size = np.median(list(itertools.islice((a.query_length for a in pysam_bam.fetch()), int(1e5))))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
Example #5
def run(_, data, out_dir):
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        counts = collections.defaultdict(lambda: collections.defaultdict(int))
        total = 0
        mapped = 0
        duplicates = 0
        with pysam.AlignmentFile(data["umi_bam"], "rb",
                                 check_sq=False) as bam_iter:
            for rec in bam_iter:
                total += 1
                umi = rec.get_tag("RX")
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    counts[key][umi] += 1
        umi_reductions = []
        umi_counts = collections.defaultdict(int)
        for key in sorted(counts.keys()):
            for c in counts[key].values():
                umi_counts[c] += 1
            total_seqs = sum(counts[key].values())
            umi_count = len(counts[key])
            umi_reductions.append(float(total_seqs) / umi_count)
        consensus_count = sum(
            [x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(
            mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (
            100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out},
                           out_handle,
                           default_flow_style=False,
                           allow_unicode=False)
    return stats_file
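
The stats file is keyed by sample name, so it round-trips cleanly with PyYAML; the path below is illustrative:

import yaml

with open("Sample1_umi_stats.yaml") as in_handle:
    stats = yaml.safe_load(in_handle)
sample_stats = stats["Sample1"]
print(sample_stats["umi_baseline_mapped"], sample_stats["umi_reduction_median"])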
Example #6
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [
            x for x in dd.get_viral_files(data)
            if os.path.basename(x) == "%s.fa" % viral_target
        ]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(
                utils.safe_makedir(out_dir), "%s-%s.bam" %
                (dd.get_sample_name(data),
                 utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(
                            tx_out_file)[0]
                        cmd = (
                            "samtools view -u -f 4 {bam_file} | "
                            "bamtofastq collate=0 | "
                            "bwa mem -t {cores} {viral_ref} - | "
                            "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                            "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
                        )
                        do.run(cmd.format(**locals()),
                               "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" %
                                         dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" %
                                                 (info.contig, info.aligned))
            out["base"] = out_file
    return out
Example #7
def run(_, data, out_dir):
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        total = 0
        mapped = 0
        duplicates = 0
        umi_reductions = []
        umi_counts = collections.defaultdict(int)
        with pysam.AlignmentFile(data["umi_bam"], "rb",
                                 check_sq=False) as bam_iter:
            cur_counts = collections.defaultdict(int)
            cur_key = None
            for rec in bam_iter:
                total += 1
                umi = _get_umi_tag(rec)
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    if key != cur_key:
                        # update counts
                        if cur_counts:
                            for c in cur_counts.values():
                                umi_counts[c] += 1
                            total_seqs = sum(cur_counts.values())
                            umi_count = len(cur_counts)
                            umi_reductions.append(
                                float(total_seqs) / umi_count)
                        # update current keys
                        cur_key = key
                        cur_counts = collections.defaultdict(int)
                    cur_counts[umi] += 1
            if cur_counts:
                for c in cur_counts.values():
                    umi_counts[c] += 1
                total_seqs = sum(cur_counts.values())
                umi_count = len(cur_counts)
                umi_reductions.append(float(total_seqs) / umi_count)
        consensus_count = sum(
            [x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(
            mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (
            100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        out["umi_raw_avg_cov"] = data["config"]["algorithm"].get(
            "rawumi_avg_cov", 0)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out},
                           out_handle,
                           default_flow_style=False,
                           allow_unicode=False)
    return stats_file
Example #8
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples, guarding against
    errors from problematic samples (e.g. no aligned reads).
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
        logger.error(
            f"{antibody} specified, but not listed as a supported antibody. "
            f"Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your "
            f"antibody should be called with narrow or broad peaks, supply 'narrow' or "
            f"'broad' as the antibody. Falling back to 'narrow' peak calling.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info(f"ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    chip_reads = sum([x.aligned for x in bam.idxstats(chip_bam, data)])
    if chip_reads == 0:
        logger.error(
            f"{chip_bam} has 0 reads. Please remove the sample and re-run")
        raise RuntimeWarning(
            f"macs2 terminated - no reads in {chip_bam}. Please remove the sample and re-run"
        )
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please check the message and report the "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "by setting resources as explained in the docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
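
The zero-read guard above goes through bcbio's bam.idxstats wrapper. The same check can be sketched with pysam alone, assuming a coordinate-sorted and indexed BAM (the third column of samtools idxstats output is the mapped read count per contig):

import pysam

def mapped_read_count(bam_path):
    """Sum mapped reads across contigs using samtools idxstats via pysam."""
    total = 0
    for line in pysam.idxstats(bam_path).splitlines():
        contig, length, mapped, unmapped = line.split("\t")
        total += int(mapped)
    return total

# e.g. guard before peak calling:
# if mapped_read_count(chip_bam) == 0: raise RuntimeWarning("no reads in %s" % chip_bam)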