Exemplo n.º 1
0
 def test_antitarget(self):
     """Exercise 'antitarget' with progressively more explicit arguments."""
     bait_regions = tabio.read_auto('formats/nv2_baits.interval_list')
     accessible = tabio.read_auto('../data/access-5k-mappable.hg19.bed')
     # Each tuple adds one more positional argument than the previous one;
     # every call form must produce a non-empty result.
     arg_sets = (
         (bait_regions,),
         (bait_regions, accessible),
         (bait_regions, accessible, 200000),
         (bait_regions, accessible, 10000, 5000),
     )
     for call_args in arg_sets:
         self.assertLess(0, len(commands.do_antitarget(*call_args)))
Exemplo n.º 2
0
 def test_antitarget(self):
     """Check that 'antitarget' yields bins for each supported call form."""
     bait_regions = tabio.read_auto('formats/nv2_baits.interval_list')
     accessible = tabio.read_auto('../data/access-5k-mappable.hg19.bed')

     # Baits only
     bins = commands.do_antitarget(bait_regions)
     self.assertLess(0, len(bins))

     # Baits restricted by the accessible regions
     bins = commands.do_antitarget(bait_regions, accessible)
     self.assertLess(0, len(bins))

     # One extra positional value after the accessible regions
     bins = commands.do_antitarget(bait_regions, accessible, 200000)
     self.assertLess(0, len(bins))

     # Two extra positional values after the accessible regions
     bins = commands.do_antitarget(bait_regions, accessible, 10000, 5000)
     self.assertLess(0, len(bins))
Exemplo n.º 3
0
 def test_read_auto(self):
     """read_auto gives the expected row count for each fixture file.

     Every fixture is read twice: once by path and once through an open
     file handle.
     """
     expected_rows = (
         ("formats/empty", 0),
         ("formats/amplicon.bed", 1433),
         ("formats/amplicon.text", 1433),
         ("formats/nv2_baits.interval_list", 6809),
         ("formats/refflat-mini.txt", 100),
         ("formats/example.gff", 6),
     )
     for path, expected in expected_rows:
         from_path = tabio.read_auto(path)
         self.assertEqual(len(from_path), expected)
         with open(path) as infile:
             from_handle = tabio.read_auto(infile)
             self.assertEqual(len(from_handle), expected)
Exemplo n.º 4
0
 def test_target(self):
     """The 'target' command, with and without splitting and annotation."""
     annotation_path = "formats/refflat-mini.txt"
     bait_paths = (
         "formats/nv2_baits.interval_list",
         "formats/amplicon.bed",
         "formats/baits-funky.bed",
     )
     # Options shared by both splitting runs
     split_opts = dict(do_short_names=True, do_split=True, avg_size=100)
     for bait_path in bait_paths:
         bait_regions = tabio.read_auto(bait_path)
         n_baits = len(bait_regions)

         # Without splitting, the row count equals the bait count,
         # whether or not the targets are re-annotated
         plain = commands.do_target(bait_regions)
         self.assertEqual(len(plain), n_baits)
         plain_annot = commands.do_target(
             bait_regions, do_short_names=True, annotate=annotation_path)
         self.assertEqual(len(plain_annot), len(plain))

         # Splitting must produce more rows than the unsplit result
         split = commands.do_target(bait_regions, **split_opts)
         self.assertGreater(len(split), len(plain))
         for _chrom, chrom_bins in split.by_chromosome():
             self.assertTrue(chrom_bins.start.is_monotonic_increasing,
                             bait_path)
             self.assertTrue(chrom_bins.end.is_monotonic_increasing,
                             bait_path)
             # Bins must not overlap: each start is at or past the
             # previous bin's end
             gaps = chrom_bins.start.values[1:] - chrom_bins.end.values[:-1]
             self.assertTrue((gaps >= 0).all())

         # Re-annotation must not change the number of split bins
         split_annot = commands.do_target(
             bait_regions, annotate=annotation_path, **split_opts)
         self.assertEqual(len(split_annot), len(split))

         # The input regions must come through every call unmodified
         self.assertEqual(len(bait_regions), n_baits)
Exemplo n.º 5
0
 def test_target(self):
     """The 'target' command, with and without splitting and annotation."""
     annotation_path = "formats/refflat-mini.txt"
     bait_paths = ("formats/nv2_baits.interval_list", "formats/amplicon.bed")
     # Options shared by both splitting runs
     split_opts = dict(do_short_names=True, do_split=True, avg_size=100)
     for bait_path in bait_paths:
         bait_regions = tabio.read_auto(bait_path)
         n_baits = len(bait_regions)

         # Unsplit: row count equals the bait count, annotated or not
         plain = commands.do_target(bait_regions)
         self.assertEqual(len(plain), n_baits)
         plain_annot = commands.do_target(
             bait_regions, do_short_names=True, annotate=annotation_path)
         self.assertEqual(len(plain_annot), len(plain))

         # Split: more rows than unsplit; annotation keeps the row count
         split = commands.do_target(bait_regions, **split_opts)
         self.assertGreater(len(split), len(plain))
         split_annot = commands.do_target(
             bait_regions, annotate=annotation_path, **split_opts)
         self.assertEqual(len(split_annot), len(split))

         # The input regions must come through every call unmodified
         self.assertEqual(len(bait_regions), n_baits)
Exemplo n.º 6
0
def bed2probes(bed_fname):
    """Build a CopyNumArray with all-zero log2 values from a regions file.

    The coordinate columns are taken from the regions read from
    `bed_fname`; the 'gene' column is copied when present and filled with
    '-' otherwise. The sample ID is derived from the file name.
    """
    regions = tabio.read_auto(bed_fname)
    probes = regions.data.loc[:, ("chromosome", "start", "end")]
    if "gene" in regions.data:
        gene_names = regions.data["gene"]
    else:
        # No gene labels available: use the placeholder for every row
        gene_names = '-'
    probes["gene"] = gene_names
    probes["log2"] = 0.0
    probes["spread"] = 0.0
    meta = {"sample_id": core.fbase(bed_fname)}
    return CNA(probes, meta)
Exemplo n.º 7
0
def bed2probes(bed_fname):
    """Create a CopyNumArray with neutral (zero) coverage from a regions file.

    Only the chromosome/start/end columns are carried over, plus 'gene'
    when the input has it ('-' is used otherwise). 'log2' and 'spread' are
    both set to 0.0.
    """
    source = tabio.read_auto(bed_fname)
    coords = source.data.loc[:, ("chromosome", "start", "end")]
    has_genes = "gene" in source.data
    coords["gene"] = source.data["gene"] if has_genes else '-'
    # Neutral values for every region
    for column in ("log2", "spread"):
        coords[column] = 0.0
    sample_id = core.fbase(bed_fname)
    return CNA(coords, {"sample_id": sample_id})
Exemplo n.º 8
0
def filter_targets(target_bed, sample_bams, procs):
    """Attach the mean coverage depth across samples to each target.

    Parameters
    ----------
    target_bed : str
        Path to the regions file; read here and also passed to
        `cnvlib.do_coverage` for each BAM.
    sample_bams : sequence of str
        BAM file names to evaluate (its length is the divisor of the mean).
    procs : int
        Forwarded to `cnvlib.do_coverage` as `processes`.

    Returns
    -------
    The regions read from `target_bed`, with a 'depth' column holding the
    unweighted mean of the per-sample 'depth' values.
    """
    baits = tabio.read_auto(target_bed)
    # Accumulate per-bin depths over all BAMs, then average.
    # np.float64 is used instead of the np.float_ alias, which was removed
    # in NumPy 2.0.
    total_depths = np.zeros(len(baits), dtype=np.float64)
    for bam_fname in sample_bams:
        logging.info("Evaluating targets in %s", bam_fname)
        sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs)
        total_depths += sample['depth'].values
    baits['depth'] = total_depths / len(sample_bams)
    return baits
Exemplo n.º 9
0
 def test_total_range_size(self):
     """total_range_size() matches the known value for each fixture file."""
     known_sizes = [
         ('formats/empty', 0),
         ('formats/my-targets.bed', 103),
         ('formats/dac-my.bed', 148),
         ('formats/example.gff', 7951),
         ('formats/refflat-mini.txt', 719715),
     ]
     for path, expected in known_sizes:
         loaded = tabio.read_auto(path)
         observed = loaded.total_range_size()
         self.assertEqual(observed, expected)
Exemplo n.º 10
0
 def test_total_range_size(self):
     """Check the total range size computed for each fixture file."""
     fixture_paths = ('formats/empty',
                      'formats/my-targets.bed',
                      'formats/dac-my.bed',
                      'formats/example.gff',
                      'formats/refflat-mini.txt')
     expected_areas = (0, 103, 148, 7951, 719715)
     # Paths and expected values are paired up by position
     for path, expected in zip(fixture_paths, expected_areas):
         self.assertEqual(tabio.read_auto(path).total_range_size(), expected)
Exemplo n.º 11
0
def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
    """Yield a ``[count, row]`` pair for each interval in the regions file.

    Regions are read from `bed_fname` and handled one chromosome at a time:
    in this process via `_rdc_chunk` when `procs` is 1, otherwise through
    `_rdc` in a process pool of `procs` workers.
    """
    regions = tabio.read_auto(bed_fname)
    if procs == 1:
        bamfile = pysam.Samfile(bam_fname, 'rb')
        try:
            for chrom, subregions in regions.by_chromosome():
                logging.info("Processing chromosome %s of %s",
                             chrom, os.path.basename(bam_fname))
                for count, row in _rdc_chunk(bamfile, subregions, min_mapq):
                    yield [count, row]
        finally:
            # Release the BAM handle once iteration ends, including when
            # the consumer abandons the generator early or an error occurs
            bamfile.close()
    else:
        with futures.ProcessPoolExecutor(procs) as pool:
            # Workers receive the BAM file name rather than an open handle
            args_iter = ((bam_fname, subr, min_mapq)
                         for _c, subr in regions.by_chromosome())
            for chunk in pool.map(_rdc, args_iter):
                for count, row in chunk:
                    yield [count, row]
Exemplo n.º 12
0
def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
    """Generate ``[count, row]`` pairs for the intervals in `bed_fname`.

    With `procs` equal to 1 the BAM file is opened once and each
    chromosome's regions go through `_rdc_chunk`. Otherwise one task per
    chromosome is mapped over a process pool of `procs` workers via `_rdc`.
    """
    regions = tabio.read_auto(bed_fname)
    if procs == 1:
        bamfile = pysam.Samfile(bam_fname, 'rb')
        try:
            for chrom, subregions in regions.by_chromosome():
                logging.info("Processing chromosome %s of %s", chrom,
                             os.path.basename(bam_fname))
                for count, row in _rdc_chunk(bamfile, subregions, min_mapq):
                    yield [count, row]
        finally:
            # Always close the BAM handle: on exhaustion, on error, or
            # when the generator is closed before it is exhausted
            bamfile.close()
    else:
        with futures.ProcessPoolExecutor(procs) as pool:
            # Each task carries the BAM file name, not an open handle
            args_iter = ((bam_fname, subr, min_mapq)
                         for _c, subr in regions.by_chromosome())
            for chunk in pool.map(_rdc, args_iter):
                for count, row in chunk:
                    yield [count, row]
Exemplo n.º 13
0
def do_target(bait_arr, annotate=None, do_short_names=False, do_split=False,
              avg_size=200/.75):
    """Derive CNVkit target regions from bait intervals.

    Works on a copy of `bait_arr`. Zero-width regions are always dropped;
    splitting, re-annotation and label shortening are each applied only
    when requested, in that order.
    """
    targets = bait_arr.copy()
    # Keep only regions whose start and end differ
    has_width = targets.start != targets.end
    targets = targets[has_width]
    if do_split:
        logging.info("Splitting large targets")
        targets = targets.subdivide(avg_size, 0)
    if annotate:
        logging.info("Applying annotations as target names")
        annot_regions = tabio.read_auto(annotate)
        antitarget.compare_chrom_names(targets, annot_regions)
        gene_labels = annot_regions.into_ranges(targets, 'gene', '-')
        targets['gene'] = gene_labels
    if do_short_names:
        logging.info("Shortening target interval labels")
        short_labels = list(shorten_labels(targets['gene']))
        targets['gene'] = short_labels
    return targets
Exemplo n.º 14
0
def do_target(bait_arr,
              annotate=None,
              do_short_names=False,
              do_split=False,
              avg_size=200 / .75):
    """Turn bait intervals into target regions for CNVkit.

    The input is copied first, so `bait_arr` itself is left as given.
    Zero-width regions are removed; the optional steps (split, annotate,
    shorten labels) run in that order when enabled.
    """
    work = bait_arr.copy()
    # Discard regions where start equals end
    nonempty_mask = work.start != work.end
    work = work[nonempty_mask]
    if do_split:
        logging.info("Splitting large targets")
        work = work.subdivide(avg_size, 0)
    if annotate:
        logging.info("Applying annotations as target names")
        annot_arr = tabio.read_auto(annotate)
        antitarget.compare_chrom_names(work, annot_arr)
        new_names = annot_arr.into_ranges(work, 'gene', '-')
        work['gene'] = new_names
    if do_short_names:
        logging.info("Shortening target interval labels")
        shortened = list(shorten_labels(work['gene']))
        work['gene'] = shortened
    return work
Exemplo n.º 15
0
 def test_target(self):
     """The 'target' command, with and without splitting and annotation."""
     annotation_path = "formats/refflat-mini.txt"
     bait_paths = ("formats/nv2_baits.interval_list", "formats/amplicon.bed")
     for bait_path in bait_paths:
         bait_regions = tabio.read_auto(bait_path)
         n_baits = len(bait_regions)

         def run(**options):
             # Every run starts from the same input regions
             return commands.do_target(bait_regions, **options)

         # Unsplit: row count equals the bait count, annotated or not
         plain = run()
         self.assertEqual(len(plain), n_baits)
         plain_annot = run(do_short_names=True, annotate=annotation_path)
         self.assertEqual(len(plain_annot), len(plain))

         # Split: more rows than unsplit; annotation keeps the row count
         split = run(do_short_names=True, do_split=True, avg_size=100)
         self.assertGreater(len(split), len(plain))
         split_annot = run(do_short_names=True, do_split=True, avg_size=100,
                           annotate=annotation_path)
         self.assertEqual(len(split_annot), len(split))

         # The input regions must come through every call unmodified
         self.assertEqual(len(bait_regions), n_baits)
Exemplo n.º 16
0
 def test_target(self):
     """The 'target' command, with and without splitting and annotation."""
     annotation_path = "formats/refflat-mini.txt"
     bait_paths = ("formats/nv2_baits.interval_list",
                   "formats/amplicon.bed",
                   "formats/baits-funky.bed")
     for bait_path in bait_paths:
         bait_regions = tabio.read_auto(bait_path)
         n_baits = len(bait_regions)

         def run(**options):
             # Every run starts from the same input regions
             return commands.do_target(bait_regions, **options)

         # Unsplit: row count equals the bait count, annotated or not
         plain = run()
         self.assertEqual(len(plain), n_baits)
         plain_annot = run(do_short_names=True, annotate=annotation_path)
         self.assertEqual(len(plain_annot), len(plain))

         # Split without annotation: more rows than the unsplit result
         split = run(do_short_names=True, do_split=True, avg_size=100)
         self.assertGreater(len(split), len(plain))
         for _chrom, chrom_bins in split.by_chromosome():
             starts = chrom_bins.start
             ends = chrom_bins.end
             self.assertTrue(starts.is_monotonic_increasing, bait_path)
             self.assertTrue(ends.is_monotonic_increasing, bait_path)
             # Bins must not overlap: each start is at or past the
             # previous bin's end
             gaps = starts.values[1:] - ends.values[:-1]
             self.assertTrue((gaps >= 0).all())

         # Split with annotation: same number of bins as without
         split_annot = run(do_short_names=True, do_split=True, avg_size=100,
                           annotate=annotation_path)
         self.assertEqual(len(split_annot), len(split))

         # The input regions must come through every call unmodified
         self.assertEqual(len(bait_regions), n_baits)
Exemplo n.º 17
0
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Prepares the target and antitarget BED files, then either builds a flat
    reference (no normal BAMs) or runs coverage on every normal BAM and
    builds the reference from those results. The reference is written to
    `output_reference`, or to "reference.cnn" under `output_dir` when no
    output path is given.

    Returns
    -------
    tuple
        ``(output_reference, target_bed, antitarget_bed)``: the path the
        reference was written to, the path of the processed
        ``*.target.bed`` file written here, and the antitarget BED path
        (the one given, or the ``*.antitarget.bed`` file written here).

    Raises
    ------
    ValueError
        If `method` is "wgs" or "amplicon" and antitargets are given, or
        `access_bed` and `target_bed` are both given but differ; also if
        `method` is "wgs" and none of targets, access or FASTA is given.
    """
    # Protocols without antitargets: reject conflicting inputs up front
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    # Set below only if the targets get read during WGS bin-size estimation
    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(
                    os.path.basename(fasta))[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    # NOTE(review): these are passed positionally right after
                    # bam_fname below -- confirm the order matches
                    # autobin.do_autobin's signature.
                    autobin_args = ['wgs', access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth,
                 target_avg_size), _ = autobin.do_autobin(bam_fname,
                                                          *autobin_args,
                                                          bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    # avg_size is passed only when a bin size was chosen; otherwise
    # do_target's own default applies. The positional True enables splitting.
    target_arr = target.do_target(
        bait_arr, annotate, short_names, True,
        **({
            'avg_size': target_avg_size
        } if target_avg_size else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    # From here on, target_bed refers to the processed targets file
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            # Only explicitly given options are forwarded as keyword arguments
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed,
                                              fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Two coverage jobs (target + antitarget) are submitted per
            # normal BAM; divide the process budget among them, minimum 1
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage, target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn', by_count,
                                procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage, antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        # The job results are used as the reference inputs below
        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames,
                                         antitarget_fnames,
                                         fasta,
                                         male_reference,
                                         None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
Exemplo n.º 18
0
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Prepares the target and antitarget BED files, then either builds a flat
    reference (no normal BAMs) or runs coverage on every normal BAM and
    builds the reference from those results. The reference is written to
    `output_reference`, or to "reference.cnn" under `output_dir` when no
    output path is given.

    Returns
    -------
    tuple
        ``(output_reference, target_bed, antitarget_bed)``: the path the
        reference was written to, the path of the processed
        ``*.target.bed`` file written here, and the antitarget BED path
        (the one given, or the ``*.antitarget.bed`` file written here).

    Raises
    ------
    ValueError
        If `method` is "wgs" or "amplicon" and antitargets are given, or
        `access_bed` and `target_bed` are both given but differ; also if
        `method` is "wgs" and none of targets, access or FASTA is given.
    """
    # Protocols without antitargets: reject conflicting inputs up front
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    # Set below only if the targets get read during WGS bin-size estimation
    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(os.path.basename(fasta)
                                             )[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    # NOTE(review): these are passed positionally right after
                    # bam_fname below; the None presumably fills a targets
                    # slot ahead of access -- confirm against
                    # autobin.do_autobin's signature.
                    autobin_args = ['wgs', None, access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(
                    bam_fname, *autobin_args, bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    # avg_size is passed only when a bin size was chosen; otherwise
    # do_target's own default applies. The positional True enables splitting.
    target_arr = target.do_target(bait_arr, annotate, short_names, True,
                                  **({'avg_size': target_avg_size}
                                     if target_avg_size
                                     else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    # From here on, target_bed refers to the processed targets file
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            # Only explicitly given options are forwarded as keyword arguments
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed, fasta,
                                              male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Two coverage jobs (target + antitarget) are submitted per
            # normal BAM; divide the process budget among them, minimum 1
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage,
                                target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn',
                                by_count, procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage,
                                antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        # The job results are used as the reference inputs below
        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames,
                                         fasta, male_reference, None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
Exemplo n.º 19
0
def main(args):
    """Re-label the 'gene' column of a CNV file from an annotation file.

    Reads `args.annotate` and `args.cnv_file`, replaces the 'gene' column
    with labels taken from the annotation ('-' as the fill value), and
    writes the result to `args.output`, or to standard output when that is
    not set.
    """
    annotation = tabio.read_auto(args.annotate)
    cnv_table = read_cna(args.cnv_file)
    gene_labels = annotation.into_ranges(cnv_table, 'gene', '-')
    cnv_table['gene'] = gene_labels
    destination = args.output or sys.stdout
    tabio.write(cnv_table, destination)
Exemplo n.º 20
0
def main(args):
    """Apply gene annotations to a CNV file and write it out.

    The annotation comes from `args.annotate` and the CNV table from
    `args.cnv_file`. Output goes to `args.output` when given, otherwise to
    standard output.
    """
    annot_regions = tabio.read_auto(args.annotate)
    segments = read_cna(args.cnv_file)
    # '-' is the label passed to into_ranges as the default
    segments['gene'] = annot_regions.into_ranges(segments, 'gene', '-')
    if args.output:
        out_target = args.output
    else:
        out_target = sys.stdout
    tabio.write(segments, out_target)