Пример #1
0
def main(args):
    """Convert a genomic-regions file between tabular formats."""
    source_label = "input" if args.infile is sys.stdin else args.infile
    fmt_label = " from " + args.in_fmt if args.in_fmt != 'auto' else ''
    logging.info("Converting %s%s to %s", source_label, fmt_label,
                 args.out_fmt)

    if args.in_fmt == 'auto':
        # Guess the input format by inspecting the file contents
        args.in_fmt = tabio.sniff_region_format(args.infile)

    # Collect reader options that only apply to specific input formats
    read_opts = {}
    if args.in_fmt == 'gff':
        if args.gff_tag:
            read_opts['tag'] = args.gff_tag
        if args.gff_type:
            read_opts['keep_type'] = args.gff_type
    elif args.in_fmt == 'refflat':
        if args.refflat_type == 'exon':
            read_opts['exons'] = True
        elif args.refflat_type == 'cds':
            read_opts['cds'] = True

    regions = tabio.read(args.infile, args.in_fmt, **read_opts)

    # Post-processing: flatten takes precedence over merge
    if args.flatten:
        regions = regions.flatten()
    elif args.merge:
        regions = regions.merge(bp=args.merge)

    tabio.write(regions, args.output, args.out_fmt)
Пример #2
0
def main(args):
    """Convert a tabular file from one format to another."""
    logging.info("Converting %s%s to %s",
                 "input" if args.infile is sys.stdin else args.infile,
                 # BUG FIX: leading space so the message reads
                 # "Converting input from <fmt> ..." instead of "inputfrom"
                 " from " + args.in_fmt if args.in_fmt != 'auto' else '',
                 args.out_fmt)

    table = tabio.read(args.infile, args.in_fmt)
    tabio.write(table, args.output, args.out_fmt)
Пример #3
0
def sample_region_cov(bam_fname, regions, max_num=100):
    """Calculate read depth in a randomly sampled subset of regions."""
    subset = sample_midsize_regions(regions, max_num)
    # Write the sampled regions to a temporary BED file for bedcov
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as bedfile:
        tabio.write(regions.as_dataframe(subset), bedfile, 'bed4')
        bedfile.flush()
        table = coverage.bedcov(bedfile.name, bam_fname, 0)
    # Mean depth = total bases covered / total length of sampled regions
    return table.basecount.sum() / (table.end - table.start).sum()
Пример #4
0
def main(args):
    """Convert a tabular file from one format to another."""
    logging.info("Converting %s%s to %s",
                 "input" if args.infile is sys.stdin else args.infile,
                 # BUG FIX: leading space so the message reads
                 # "Converting input from <fmt> ..." instead of "inputfrom"
                 " from " + args.in_fmt if args.in_fmt != 'auto' else '',
                 args.out_fmt)

    # TODO - add back merge/flatten/exon options from refFlat2bed
    table = tabio.read(args.infile, args.in_fmt)
    tabio.write(table, args.output, args.out_fmt)
Пример #5
0
def sample_region_cov(bam_fname, regions, max_num=100):
    """Calculate read depth in a randomly sampled subset of regions."""
    midsize = sample_midsize_regions(regions, max_num)
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as tmp_bed:
        # Dump the sampled regions so the external coverage tool can read them
        tabio.write(regions.as_dataframe(midsize), tmp_bed, 'bed4')
        tmp_bed.flush()
        covtable = coverage.bedcov(tmp_bed.name, bam_fname, 0)
    # Mean read depth = summed base counts over summed region lengths
    return covtable.basecount.sum() / (covtable.end - covtable.start).sum()
Пример #6
0
def _cmd_ztest(args):
    """Run the z-test on a copy-ratio file; write significant hits."""
    cnarr = cnvlib.read(args.cnarr)
    if not args.segment:
        # Without segments, sample sex is needed for the test
        segments = None
        is_sample_female = verify_sample_sex(cnarr, args.sample_sex,
                                             args.male_reference)
    else:
        segments = cnvlib.read(args.segment)
        is_sample_female = None
    sig = do_ztest(cnarr, segments, args.male_reference, is_sample_female,
                   args.alpha, args.target)
    # Only write output if anything was significant
    if len(sig):
        tabio.write(sig, args.output or sys.stdout)
Пример #7
0
def _cmd_ztest(args):
    """Z-test for significant bins, optionally constrained to segments."""
    cnarr = cnvlib.read(args.cnarr)
    segments = cnvlib.read(args.segment) if args.segment else None
    if segments is None:
        # No segments given: determine sample sex for the test
        is_sample_female = verify_sample_sex(cnarr, args.sample_sex,
                                             args.male_reference)
    else:
        is_sample_female = None
    sig = do_ztest(cnarr, segments, args.male_reference, is_sample_female,
                   args.alpha, args.target)
    if len(sig):
        # Emit only when at least one bin passed the significance test
        tabio.write(sig, args.output or sys.stdout)
Пример #8
0
def main(args):
    """Aggregate RNA gene counts, normalize, and write summary/CNR outputs."""
    sample_counts = rna.filter_probes(aggregate_gene_counts(args.gene_counts))
    # DBG
    if args.output:
        counts_fname = args.output + ".sample_counts.tsv"
        sample_counts.to_csv(counts_fname, sep='\t', index=True)
        print("Wrote", counts_fname, "with", len(sample_counts), "rows")

    if args.correlations:
        logging.info("Loading gene metadata "
                     "and TCGA gene expression/CNV profiles")
    else:
        logging.info("Loading gene metadata")

    gene_info = rna.load_gene_info(args.gene_resource, args.correlations)

    print("Aligning gene info to sample gene counts")
    (gene_info, sample_counts,
     sample_data_log2) = rna.align_gene_info_to_samples(gene_info,
                                                        sample_counts, None)

    print("Writing output files")
    # Summary table has log2-normalized values, not raw counts
    # ENH show both, with column header suffixes to distinguish?
    all_data = pd.concat([gene_info, sample_data_log2], axis=1)
    if args.output:
        all_data.to_csv(args.output, sep='\t', index=True)
        print("Wrote", args.output, "with", len(all_data), "rows")
    else:
        print(all_data.describe(), file=sys.stderr)

    if args.cnr_dir:
        # CNVkit files have both absolute and log2-normalized read counts
        for cnr in rna.attach_gene_info_to_cnr(sample_counts,
                                               sample_data_log2, gene_info):
            # Capture the output name before correction, as in the original
            outfname = os.path.join(args.cnr_dir, cnr.sample_id + ".cnr")
            cnr = rna.correct_cnr(cnr)
            tabio.write(cnr, outfname, 'tab')
Пример #9
0
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rlibpath, by_count, skip_low, method, processes):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    out_prefix = os.path.join(output_dir, core.fbase(bam_fname))

    # Raw bin-level coverages for targets and antitargets
    tgt_cov = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes)
    tabio.write(tgt_cov, out_prefix + '.targetcoverage.cnn')
    anti_cov = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes)
    tabio.write(anti_cov, out_prefix + '.antitargetcoverage.cnn')

    # Normalize to the reference to obtain copy ratios (.cnr)
    cnarr = fix.do_fix(tgt_cov, anti_cov, read_cna(ref_fname), do_gc=True,
                       do_edge=(method == "hybrid"), do_rmask=True)
    tabio.write(cnarr, out_prefix + '.cnr')

    logging.info("Segmenting %s.cnr ...", out_prefix)
    seg_kwargs = {'threshold': 1e-6} if method == 'wgs' else {}
    segments = segmentation.do_segmentation(cnarr, 'cbs', rlibpath=rlibpath,
                                            skip_low=skip_low,
                                            processes=processes, **seg_kwargs)
    tabio.write(segments, out_prefix + '.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, segments)
        pyplot.savefig(out_prefix + '-scatter.pdf', format='pdf',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.pdf", out_prefix)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = out_prefix + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               segments.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)
Пример #10
0
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rlibpath, by_count, skip_low, method, processes):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_id = core.fbase(bam_fname)
    prefix = os.path.join(output_dir, sample_id)

    raw_target = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                      processes)
    tabio.write(raw_target, prefix + '.targetcoverage.cnn')

    raw_antitarget = coverage.do_coverage(antitarget_bed, bam_fname, by_count,
                                          0, processes)
    tabio.write(raw_antitarget, prefix + '.antitargetcoverage.cnn')

    # Bias-correct and normalize against the copy-number reference
    cnarr = fix.do_fix(raw_target,
                       raw_antitarget,
                       read_cna(ref_fname),
                       do_gc=True,
                       do_edge=(method == "hybrid"),
                       do_rmask=True)
    tabio.write(cnarr, prefix + '.cnr')

    logging.info("Segmenting %s.cnr ...", prefix)
    extra_seg_kwargs = {'threshold': 1e-6} if method == 'wgs' else {}
    segments = segmentation.do_segmentation(cnarr, 'cbs', rlibpath=rlibpath,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **extra_seg_kwargs)
    tabio.write(segments, prefix + '.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, segments)
        pyplot.savefig(prefix + '-scatter.pdf', format='pdf',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.pdf", prefix)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        diagram_fname = prefix + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               segments.shift_xx(male_reference, is_xx),
                               0.5, 3, diagram_fname)
        logging.info("Wrote %s", diagram_fname)
Пример #11
0
        '-g',
        '--gene-resource',
        metavar="FILE",
        required=True,
        # default="data/ensembl-gene-info.hg38.tsv",
        help="Ensembl BioMart-derived gene info table.")
    AP.add_argument('-d', '--output-dir', metavar='PATH', default='.',
                    help="Output directory.")

    args = AP.parse_args()
    gene_info = load_gene_info(args.gene_resource, None, None)
    # Drop noisy non-coding RNA entries that would clutter the annotation
    bad_genes = ['Metazoa_SRP', '5S_rRNA', 'Y_RNA', 'U1', 'U2', 'U3', 'U4',
                 'U5', 'U6', 'U7', 'U8', 'uc_338', 'Clostridiales-1']
    keep_mask = ~gene_info['gene'].isin(bad_genes)
    gene_info = GA(gene_info[keep_mask].loc[:, ('chromosome', 'start', 'end',
                                                'gene')])

    for seg_fname in args.seg_files:
        seg = tabio.read(seg_fname, 'seg')
        # Assign gene names to segments using genomic coordinates from gene_info
        seg['gene'] = genes_in_segments(seg, gene_info)

        outfname = os.path.join(args.output_dir,
                                basename(seg_fname) + ".acgh.cns")
        tabio.write(seg, outfname, 'tab')
        print("Wrote", outfname)
Пример #12
0
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rscript_path, by_count, skip_low, seq_method,
                     segment_method, processes, do_cluster, fasta=None):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_pfx = os.path.join(output_dir, core.fbase(bam_fname))

    # Bin-level coverage for targets, then antitargets
    tgt_raw = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes, fasta)
    tabio.write(tgt_raw, sample_pfx + '.targetcoverage.cnn')
    anti_raw = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes, fasta)
    tabio.write(anti_raw, sample_pfx + '.antitargetcoverage.cnn')

    # Normalize against the reference to produce copy ratios
    cnarr = fix.do_fix(tgt_raw, anti_raw, read_cna(ref_fname), do_gc=True,
                       do_edge=(seq_method == "hybrid"), do_rmask=True,
                       do_cluster=do_cluster)
    tabio.write(cnarr, sample_pfx + '.cnr')

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    wgs_kwargs = {'threshold': 1e-6} if seq_method == 'wgs' else {}
    segments = segmentation.do_segmentation(cnarr, segment_method,
                                            rscript_path=rscript_path,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **wgs_kwargs)
    logging.info("Post-processing %s.cns ...", sample_pfx)
    # TODO/ENH take centering shift & apply to .cnr for use in segmetrics
    seg_metrics = segmetrics.do_segmetrics(cnarr, segments,
                                           interval_stats=['ci'], alpha=0.5,
                                           smoothed=True)
    tabio.write(seg_metrics, sample_pfx + '.cns')

    # Remove likely false-positive breakpoints
    seg_call = call.do_call(seg_metrics, method="none", filters=['ci'])
    # Calculate another segment-level test p-value
    seg_alltest = segmetrics.do_segmetrics(cnarr, seg_call,
                                           location_stats=['p_ttest'])
    # Finally, assign absolute copy number values to each segment
    seg_alltest.center_all("median")
    seg_final = call.do_call(seg_alltest, method="threshold")
    tabio.write(seg_final, sample_pfx + '.call.cns')

    # Test for single-bin CNVs separately
    seg_bintest = bintest.do_bintest(cnarr, seg_call, target_only=True)
    tabio.write(seg_bintest, sample_pfx + '.bintest.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, seg_final)
        pyplot.savefig(sample_pfx + '-scatter.png', format='png',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.png", sample_pfx)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = sample_pfx + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               seg_final.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)
Пример #13
0
def main(args):
    """Annotate bins in a copy-number file with overlapping gene names."""
    annotation = tabio.read_auto(args.annotate)
    cnarr = read_cna(args.cnv_file)
    # '-' is the placeholder where no annotation covers the bin
    cnarr['gene'] = annotation.into_ranges(cnarr, 'gene', '-')
    tabio.write(cnarr, args.output or sys.stdout)
Пример #14
0
AP.add_argument("cnn_files", nargs='+',
        help="""CNVkit coverage files to update (*.targetcoverage.cnn,
                *.antitargetcoverage.cnn).""")
AP.add_argument("-d", "--output-dir", default=".",
        help="""Directory to write output .cnn files.""")
AP.add_argument("-s", "--suffix", default=".updated",
        help="""Filename suffix to add before the '.cnn' extension in output
                files. [Default: %(default)s]""")
args = AP.parse_args()

for fname in args.cnn_files:
    cnarr = cnvlib.read(fname)
    # Convert coverage depths from log2 scale to absolute scale.
    # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11;
    # earlier than that, the average 'depth' will be about 1.0.
    cnarr['depth'] = np.exp2(cnarr['log2'])
    # Rename "Background" bins to "Antitarget"
    # NB: The default off-target bin name was changed in CNVkit v0.9.0
    cnarr['gene'] = cnarr['gene'].replace("Background",
                                          cnvlib.params.ANTITARGET_NAME)
    cnarr.sort_columns()
    # Build the output name: insert the suffix before any ".<zone>.cnn" tail
    base, ext = os.path.basename(fname).rsplit('.', 1)
    if '.' not in base:
        # e.g. reference.cnn or .cnr file, no "*.targetcoverage.*" in name
        out_fname = '.'.join((base + args.suffix, ext))
    else:
        base, zone = base.rsplit('.', 1)
        out_fname = '.'.join((base + args.suffix, zone, ext))
    tabio.write(cnarr, os.path.join(args.output_dir, out_fname))
Пример #15
0
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Returns a 3-tuple ``(output_reference, target_bed, antitarget_bed)``:
    the written reference filename and the (possibly regenerated)
    target/antitarget BED filenames that were actually used.
    """
    # Antitargets only make sense for the 'hybrid' protocol; reject
    # conflicting option combinations up front.
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(
                    os.path.basename(fasta))[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                # Only the (depth, bin size) pair from do_autobin is used;
                # the second element of its result is discarded here.
                (wgs_depth,
                 target_avg_size), _ = autobin.do_autobin(bam_fname,
                                                          *autobin_args,
                                                          bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    target_arr = target.do_target(
        bait_arr, annotate, short_names, True,
        **({
            'avg_size': target_avg_size
        } if target_avg_size else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed,
                                              fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Each sample spawns two coverage jobs (target + antitarget),
            # so divide the worker budget accordingly.
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage, target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn', by_count,
                                procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage, antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames,
                                         antitarget_fnames,
                                         fasta,
                                         male_reference,
                                         None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
Пример #16
0
                default=".",
                help="""Directory to write output .cnn files.""")
AP.add_argument(
    "-s",
    "--suffix",
    default=".updated",
    help="""Filename suffix to add before the '.cnn' extension in output
                files. [Default: %(default)s]""")
args = AP.parse_args()

for fname in args.cnn_files:
    cnarr = cnvlib.read(fname)
    # Undo the log2 transform to recover absolute-scale read depths.
    # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11;
    # earlier than that, the average 'depth' will be about 1.0.
    cnarr['depth'] = np.exp2(cnarr['log2'])
    # Rename "Background" bins to "Antitarget"
    # NB: The default off-target bin name was changed in CNVkit v0.9.0
    cnarr['gene'] = cnarr['gene'].replace("Background",
                                          cnvlib.params.ANTITARGET_NAME)
    cnarr.sort_columns()
    # Derive the output filename, inserting args.suffix before '.cnn'
    stem, ext = os.path.basename(fname).rsplit('.', 1)
    if '.' in stem:
        stem, zone = stem.rsplit('.', 1)
        name_parts = (stem + args.suffix, zone, ext)
    else:
        # e.g. reference.cnn or .cnr file, no "*.targetcoverage.*" in name
        name_parts = (stem + args.suffix, ext)
    tabio.write(cnarr, os.path.join(args.output_dir, '.'.join(name_parts)))
Пример #17
0

def clipped_rolling_mean(values, window):
    """Centered rolling mean of *values* after clipping extremes to [-3, 3]."""
    bounded = values.clip(-3, 3)
    # min_periods=1 keeps the edges defined instead of NaN
    return bounded.rolling(window, min_periods=1, center=True).mean().values


def smooth_by_arm(cnarr, window):
    """Return a copy of *cnarr* with log2 values smoothed within each arm."""
    smoothed_arms = []
    for _chrom, cnarm in cnarr.by_arm():
        smoothed_arms.append(clipped_rolling_mean(cnarm['log2'], window))
    # Reassemble the per-arm chunks back into one column
    new_table = cnarr.data.assign(log2=np.concatenate(smoothed_arms))
    return cnarr.as_dataframe(new_table)


AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('cnr_fnames', nargs='+')
AP.add_argument('-w', '--window', type=int, default=100,
                help="Window size for smoothing.")
AP.add_argument('-d', '--output-dir', default='.')
args = AP.parse_args()

for fname in args.cnr_fnames:
    smoothed = smooth_by_arm(cnvlib.read(fname), args.window)
    # Insert ".tsmooth<window>" before the original extension
    stem, ext = os.path.basename(fname).rsplit(".", 1)
    outfname = "{}/{}.tsmooth{}.{}".format(args.output_dir, stem,
                                           args.window, ext)
    tabio.write(smoothed, outfname)
    print("Wrote", outfname, file=sys.stderr)
Пример #18
0
def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
    """Run coverage on one sample, write to file.

    Returns the output filename so callers (e.g. futures) can collect it.
    """
    result = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
    tabio.write(result, out_fname)
    return out_fname
Пример #19
0
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Returns ``(output_reference, target_bed, antitarget_bed)`` -- the
    reference .cnn filename plus the processed target/antitarget BED paths.
    """
    # The 'wgs' and 'amplicon' protocols have no antitargets; catch
    # incompatible option combinations before doing any work.
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(os.path.basename(fasta)
                                             )[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', None, access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                # Only the (depth, bin size) estimate is used here; the rest
                # of do_autobin's result is discarded.
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(
                    bam_fname, *autobin_args, bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    target_arr = target.do_target(bait_arr, annotate, short_names, True,
                                  **({'avg_size': target_avg_size}
                                     if target_avg_size
                                     else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed, fasta,
                                              male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Two coverage jobs per sample (target + antitarget); divide
            # the available workers between them.
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage,
                                target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn',
                                by_count, procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage,
                                antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames,
                                         fasta, male_reference, None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
Пример #20
0
def genes_in_segments(segarr, gene_info):
    """Label each segment with the (unique-joined) names of overlapping genes."""
    gene_labels = gene_info.into_ranges(segarr, 'gene', '-', join_unique)
    return gene_labels


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('seg_files', nargs='+',
                        help="Segmented aCGH data in SEG format.")
    parser.add_argument('-g', '--gene-resource', metavar="FILE", required=True,
                        # default="data/ensembl-gene-info.hg38.tsv",
                        help="Ensembl BioMart-derived gene info table.")
    parser.add_argument('-d', '--output-dir', metavar='PATH', default='.',
                        help="Output directory.")
    args = parser.parse_args()

    # Load the gene annotation table, dropping noisy/ambiguous gene names
    gene_info = load_gene_info(args.gene_resource, None, None)
    bad_genes = ['Metazoa_SRP', '5S_rRNA', 'Y_RNA', 'U1', 'U2', 'U3', 'U4',
                 'U5', 'U6', 'U7', 'U8', 'uc_338', 'Clostridiales-1']
    keep_rows = ~gene_info['gene'].isin(bad_genes)
    gene_info = GA(gene_info[keep_rows]
                   .loc[:, ('chromosome', 'start', 'end', 'gene')])

    for seg_fname in args.seg_files:
        # Assign gene names to segments using genomic coordinates from gene_info
        seg = tabio.read(seg_fname, 'seg')
        seg['gene'] = genes_in_segments(seg, gene_info)
        outfname = os.path.join(args.output_dir,
                                basename(seg_fname) + ".acgh.cns")
        tabio.write(seg, outfname, 'tab')
        print("Wrote", outfname)
Пример #21
0
from cnvlib.params import NULL_LOG2_COVERAGE
from skgenome import tabio

READ_LENGTH = 150  # Not super important


def parse_coords(coords):
    """Split a 'chrom:start-end' string into (chrom, 0-based start, end).

    The start is shifted from 1-based (as displayed) to 0-based
    half-open coordinates.
    """
    chrom, span = coords.split(':', 1)
    start_txt, end_txt = span.split('-')
    return chrom, int(start_txt) - 1, int(end_txt)


# Load the Salmon/Sailfish quantification table named on the command line.
quant = pd.read_table(sys.argv[1])
# The 'Name' column holds genomic coordinates formatted as 'chrom:start-end'.
chroms, starts, ends = zip(*quant['Name'].apply(parse_coords))

# Approximate per-region read depth from the read count and region length.
depths = READ_LENGTH * quant['NumReads'] / quant['Length']
# Normalize TPM to the median over regions with nonzero depth, then log2.
norm_depth = quant['TPM'] / quant['TPM'][depths > 0].median()
log2_ratios = safe_log2(norm_depth, NULL_LOG2_COVERAGE)
# Weight each region proportionally to its effective length.
weights = quant['EffectiveLength'] / quant['EffectiveLength'].max()

cnarr = CNA.from_columns({
    'chromosome': chroms,
    'start': starts,
    'end': ends,
    'gene': '-',
    'log2': log2_ratios,
    'depth': depths,
    'weight': weights,
})
cnarr.sort()
tabio.write(cnarr, sys.stdout)
Пример #22
0
def main(args):
    """Fill in the 'gene' column of a CNV table from an annotation file."""
    annotation = tabio.read_auto(args.annotate)
    cnv_table = read_cna(args.cnv_file)
    cnv_table['gene'] = annotation.into_ranges(cnv_table, 'gene', '-')
    tabio.write(cnv_table, args.output or sys.stdout)
Пример #23
0
import logging

from skgenome import tabio

logging.basicConfig(level=logging.INFO, format="%(message)s")

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('refflat',
                    help="UCSC refFlat.txt for the reference genome.")
parser.add_argument('-e', '--exons', action='store_true',
                    help="""Emit each exon instead of the whole gene regions.""")
parser.add_argument('-f', '--flatten', action='store_true',
                    help="""Flatten overlapping regions, keeping original
                boundaries. Not recommended with --exons.""")
parser.add_argument('-m', '--merge',
                    metavar='BASEPAIRS', nargs='?', type=int, const=1,
                    help="""Merge overlapping regions with different names.
                Recommended with --exons. Optional argument value is the
                number of overlapping bases between two regions to trigger a
                merge. [Default: %(const)s]""")
parser.add_argument('-o', '--output',
                    help="Output filename.")
args = parser.parse_args()

# Read gene regions from the refFlat table, then optionally combine overlaps
# before emitting BED4 output.
regions = tabio.read(args.refflat, 'refflat', exons=args.exons)
if args.flatten:
    regions = regions.flatten()
elif args.merge:
    regions = regions.merge(bp=args.merge)
tabio.write(regions, args.output, 'bed4')
Пример #24
0
    def create_master_report(self, time_point, normal_time_point, report_file):
        """Build per-sample CNV calls, plots, and report files for one time point.

        Loads log2 ratios, segments, and VAFs for ``<patientid>_<time_point>``,
        produces called copy-number segments (running CNVkit's caller when the
        data did not come from bcbio), writes the .cnr/.called.cns tables plus
        a .gainloss report and a karyotype scatter plot, and returns the called
        segments as a DataFrame.

        NOTE(review): ``report_file`` is currently unused (the final write_df
        call is commented out) — kept for interface compatibility.
        """
        master_metadata = []

        sample = self.patientid + "_" + time_point
        normal_sample = self.patientid + "_" + normal_time_point

        # Sample annotation sheet (gender, purity, ...); normalize gender labels
        self.annodf = get_sample_info(self.patient_dir)
        self.annodf['gender'] = self.annodf.gender.replace('W',
                                                           'female').replace(
                                                               'M', 'male')

        # Apply any manual purity correction configured for this sample.
        if sample in corr_purity:
            # BUG FIX: the bare name `annodf` was undefined here (the attribute
            # is self.annodf, set above) and raised NameError in this branch.
            logger.info("correcting purity from {} to {}".format(
                self.annodf.loc[sample].purity, corr_purity[sample]))
            # .at replaces DataFrame.set_value, removed in pandas 1.0.
            self.annodf.at[sample, 'purity'] = corr_purity[sample]

        cnr_filename = join(self.out_folder, "{}.cnr".format(sample))

        logger.debug("Getting log2 ratio df for sample {}".format(sample))
        cnvkit_cnr = self.get_log2_ratio_df(sample)
        cnvkit_cns = self.get_segments_df(sample, cnvkit_cnr)
        cnvkit_vaf = self.get_cnvkit_vaf(sample, normal_sample)

        logger.debug('pipeline provance set to (not_bcbio): {}'.format(
            self.not_bcbio))
        if self.not_bcbio:
            # Look up gender; relapse-sample IDs map back to a shared record.
            try:
                gender = self.annodf.loc[sample.replace('CR', 'REL').replace(
                    'REL2', 'REL')].gender
            except KeyError:
                gender = None

            try:
                purity = self.annodf.loc[sample].purity
                logger.info('purity: {}'.format(purity))
            except KeyError:
                # No annotation -> assume a fully pure sample (percent scale).
                purity = 100
            # Consistency: use the module logger like the rest of this method.
            logger.info({"gender": gender, "purity": purity})

            # High-purity samples can be called clonally; otherwise threshold.
            calling_method = 'clonal' if purity > 90 else 'threshold'

            cnvkit_called = cnvkit_call(cnvkit_cns,
                                        variants=cnvkit_vaf,
                                        is_sample_female=gender == 'female',
                                        is_reference_male=gender == 'male',
                                        purity=purity / 100,
                                        method=calling_method)

            if time_point != normal_time_point:
                # Tabulate genes spanning copy-number breakpoints.
                breaks = pd.DataFrame(cnvkit_breaks(
                    cnvkit_cnr, cnvkit_called)).fillna("").replace('nan', '')
                breaks.columns = [
                    'Gene', 'Chrom.', 'Location', 'Change', 'ProbesLeft',
                    'ProbesRight'
                ]

                # Per-gene gain/loss report next to the .cnr file.
                gainloss = pd.DataFrame(
                    cnvkit_gainloss(cnvkit_cnr,
                                    segments=cnvkit_called,
                                    male_reference=gender == 'male'))
                print(cnr_filename.replace('.cnr', '.gainloss'))
                gainloss.to_csv(cnr_filename.replace('.cnr', '.gainloss'),
                                sep="\t",
                                index=None)
        else:
            # bcbio already produced called segments: load them and drop
            # unplaced GL* contigs.
            cnvkit_called = pd.read_table(
                get_log2_ratio_file(self.patient_dir, self.not_bcbio,
                                    sample).replace('.cnr', '-call.cns'),
                dtype={
                    'chromosome': 'str'
                }).loc[lambda df: df.chromosome.str.startswith('GL') == False]
            cnvkit_called = CopyNumArray(cnvkit_called)

        # Persist the bin-level ratios and the called segments.
        tabio.write(cnvkit_cnr, cnr_filename)
        tabio.write(cnvkit_called, cnr_filename.replace(".cnr", ".called.cns"))

        metadata_instance = "called"
        logger.info(metadata_instance)
        master_metadata.append(metadata_instance)

        do_plots = True
        if do_plots:
            # Karyotype-style scatter of bins, segments, and variant VAFs.
            pylab.rcParams['figure.figsize'] = (25, 8)
            cnvkit_scatterplot(cnarr=cnvkit_cnr,
                               segments=cnvkit_called,
                               variants=cnvkit_vaf,
                               do_trend=True,
                               title=sample)
            savefig(join(self.out_folder, '{}.karyotype.png'.format(sample)))
            pylab.clf()

        return cnvkit_called.data
Пример #25
0
def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
    """Compute bin-level coverage for one sample and save it to *out_fname*.

    Returns the output filename so parallel callers can collect results.
    """
    sample_cov = coverage.do_coverage(bed_fname, bam_fname, by_count, 0,
                                      processes)
    tabio.write(sample_cov, out_fname)
    return out_fname
Пример #26
0
    # Minimum size (bp) for a region to count as captured.
    AP_access.add_argument('-l',
                           '--min-length',
                           metavar='TARGET_SIZE',
                           type=int,
                           default=50,
                           help="""Minimum region length to accept as captured.
                    [Default: %(default)s]""")

    args = AP.parse_args()

    # ENH: can we reserve multiple cores for htslib?
    if args.processes < 1:
        # Non-positive -> None, presumably "use all cores" downstream — TODO confirm
        args.processes = None

    if args.targets:
        # A candidate target list was given: keep only baits that look
        # captured in the sample BAMs.
        baits = filter_targets(args.targets, args.sample_bams, args.processes,
                               args.fasta)
    else:
        # No target list: scan accessible regions for captured intervals.
        baits = scan_targets(
            args.access,
            args.sample_bams,
            0.5 * args.min_depth,  # More sensitive 1st pass
            args.min_gap,
            args.min_length,
            args.processes)
    # Final depth-based filtering, then emit the baits as BED.
    baits = normalize_depth_log2_filter(baits, args.min_depth)
    tabio.write(baits, args.output or sys.stdout, 'bed')
    if args.coverage:
        # Optionally also write a table with each bait's log2 depth ratio
        # relative to the median depth.
        baits['log2'] = np.log2(baits['depth'] / baits['depth'].median())
        tabio.write(baits, args.coverage, 'tab')
Пример #27
0
"""Extract target and antitarget BED files from a CNVkit reference file.

Once you have a stable CNVkit reference for your platform, you can use this
script to drop the "bad" bins from your target and antitarget BED files and
avoid unnecessarily calculating coverage in those bins during future runs.

This script is also useful to recover the target and antitarget BED files that
match the reference if those BED files are missing or you're not sure which ones
are correct.
"""
import argparse
import logging

import cnvlib
from cnvlib import reference
from skgenome import tabio

logging.basicConfig(level=logging.INFO, format="%(message)s")


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("reference", help="Reference file.")
parser.add_argument("-o", "--output",
                    help="Output base name (extensions added automatically).")
args = parser.parse_args()

# Recover the target/antitarget regions encoded in the reference .cnn
# and write each set out as a BED4 file.
ref_cnn = cnvlib.read(args.reference)
targets, antitargets = reference.reference2regions(ref_cnn)
out_base = args.output or ref_cnn.sample_id
tabio.write(targets, out_base + '.target.bed', 'bed4')
tabio.write(antitargets, out_base + '.antitarget.bed', 'bed4')
Пример #28
0
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rscript_path, by_count, skip_low, seq_method,
                     segment_method, processes, do_cluster):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_id = core.fbase(bam_fname)
    sample_pfx = os.path.join(output_dir, sample_id)

    raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes)
    tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')

    raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes)
    tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')

    cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
                       do_gc=True, do_edge=(seq_method == "hybrid"), do_rmask=True,
                       do_cluster=do_cluster)
    tabio.write(cnarr, sample_pfx + '.cnr')

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    segments = segmentation.do_segmentation(cnarr, segment_method,
                                            rscript_path=rscript_path,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **({'threshold': 1e-6}
                                               if seq_method == 'wgs'
                                               else {}))
    logging.info("Post-processing %s.cns ...", sample_pfx)
    # TODO/ENH take centering shift & apply to .cnr for use in segmetrics
    seg_metrics = segmetrics.do_segmetrics(cnarr, segments,
                                           interval_stats=['ci'], alpha=0.5,
                                           smoothed=True)
    tabio.write(seg_metrics, sample_pfx + '.cns')

    # Remove likely false-positive breakpoints
    seg_call = call.do_call(seg_metrics, method="none", filters=['ci'])
    # Calculate another segment-level test p-value
    seg_alltest = segmetrics.do_segmetrics(cnarr, seg_call, location_stats=['p_ttest'])
    # Finally, assign absolute copy number values to each segment
    seg_alltest.center_all("median")
    seg_final = call.do_call(seg_alltest, method="threshold")
    tabio.write(seg_final, sample_pfx + '.call.cns')

    # Test for single-bin CNVs separately
    seg_bintest = bintest.do_bintest(cnarr, seg_call, target_only=True)
    tabio.write(seg_bintest, sample_pfx + '.bintest.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, seg_final)
        pyplot.savefig(sample_pfx + '-scatter.png', format='png',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.png", sample_pfx)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = sample_pfx + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               seg_final.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)