Пример #1
0
 def _from_intervals(coords):
     """Wrap an iterable of (start, end, gene) records in a GA.

     Every record is placed on the placeholder chromosome 'chr0', and
     `sort_columns()` is called on the array before it is returned.
     """
     table = pd.DataFrame(list(coords), columns=['start', 'end', 'gene'])
     table = table.assign(chromosome='chr0')
     result = GA(table)
     result.sort_columns()
     return result
Пример #2
0
def reference2regions(refarr):
    """Split reference into target and antitarget regions.

    A row counts as an antitarget when its 'gene' value is one of
    `params.ANTITARGET_ALIASES`; every other row is a target. Only the
    chromosome/start/end/gene columns are carried over.

    Returns a 2-tuple: (targets, antitargets).
    """
    kept_columns = ('chromosome', 'start', 'end', 'gene')
    antitarget_mask = refarr['gene'].isin(params.ANTITARGET_ALIASES)
    bins = GA(refarr.data.loc[:, kept_columns], {'sample_id': 'reference'})
    return bins[~antitarget_mask], bins[antitarget_mask]
Пример #3
0
def reference2regions(refarr):
    """Split reference into target and antitarget regions.

    Rows whose 'gene' value equals 'Background' are the antitargets; all
    remaining rows are targets. Only the chromosome/start/end/gene columns
    are carried over.

    Returns a 2-tuple: (targets, antitargets).
    """
    kept_columns = ('chromosome', 'start', 'end', 'gene')
    background_mask = refarr['gene'] == 'Background'
    bins = GA(refarr.data.loc[:, kept_columns], {'sample_id': 'reference'})
    return bins[~background_mask], bins[background_mask]
Пример #4
0
def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length,
                 procs):
    """Estimate baited regions from a genome-wide, per-base depth profile.

    `access_bed` is split into chunks by `parallel.to_chunks`; each chunk is
    handed to `_scan_depth` together with the BAM list and the depth/gap/
    length thresholds. The per-chunk results are concatenated into one GA,
    and its 'depth' column is divided by the number of sample BAMs.
    """
    logging.info("Scanning for enriched regions in:\n  %s",
                 '\n  '.join(sample_bams))
    found = []
    # ENH: a context manager could take over removing the BED chunks.
    with parallel.pick_pool(procs) as pool:
        jobs = ((chunk, sample_bams, min_depth, min_gap, min_length)
                for chunk in parallel.to_chunks(access_bed))
        for chunk_fname, chunk_baits in pool.map(_scan_depth, jobs):
            found.append(chunk_baits)
            # The chunk is no longer needed once its result is collected
            parallel.rm(chunk_fname)
    merged = GA(pd.concat(found))
    merged['depth'] /= len(sample_bams)
    return merged
Пример #5
0
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Steps performed, in order:

    1. Validate the inputs for the "wgs" and "amplicon" protocols.
    2. For "wgs", derive a target BED (from `access_bed` or by running
       `access.do_access` on `fasta`) and pick a bin size when none is given.
    3. Pre-process the targets with `target.do_target` and write them to
       ``<base>.target.bed``.
    4. If no antitarget BED was given, write ``<base>.antitarget.bed``:
       computed by `antitarget.do_antitarget` for "hybrid", empty otherwise.
    5. Build either a flat reference (no normal samples) or a pooled
       reference from per-sample coverage files, and write it out.

    Args:
        normal_bams: Paths of normal-sample BAM files. Any falsy value
            (empty list or None) produces a flat reference.
        target_bed: Path of the target/bait regions file. May be unset for
            "wgs" when `access_bed` or `fasta` is available.
        antitarget_bed: Path of an existing antitarget BED. Must be unset for
            "wgs" and "amplicon".
        male_reference: Forwarded to the reference-building functions.
        fasta: Path of the reference genome FASTA; forwarded to the
            reference builders and, for "wgs", used to compute accessible
            regions when needed.
        annotate: Forwarded to `target.do_target`; for "wgs" a warning is
            logged when it is not set.
        short_names: Forwarded to `target.do_target`.
        target_avg_size: Average target bin size. When falsy in "wgs" mode it
            is estimated with `autobin.do_autobin` (normals given) or set to
            5000 (no normals).
        access_bed: Path of the sequencing-accessible regions file.
        antitarget_avg_size: Optional `avg_bin_size` for
            `antitarget.do_antitarget`.
        antitarget_min_size: Optional `min_bin_size` for
            `antitarget.do_antitarget`.
        output_reference: Output path for the reference; defaults to
            ``reference.cnn`` inside `output_dir`.
        output_dir: Directory for the intermediate and output files.
        processes: Worker count handed to `parallel.pick_pool`; also used to
            split workers among the coverage jobs.
        by_count: Forwarded to `batch_write_coverage`.
        method: Protocol name; the code distinguishes "hybrid", "wgs" and
            "amplicon".

    Returns:
        A tuple ``(output_reference, target_bed, antitarget_bed)`` holding
        the paths of the written reference and of the target and antitarget
        BED files actually used.

    Raises:
        ValueError: If antitargets are given for "wgs"/"amplicon", if
            targets and access differ for those protocols, or if "wgs" has
            none of targets, access or FASTA to work from.
    """
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(
                    os.path.basename(fasta))[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth,
                 target_avg_size), _ = autobin.do_autobin(bam_fname,
                                                          *autobin_args,
                                                          bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        # Not already loaded by the WGS bin-size estimation above
        bait_arr = tabio.read_auto(target_bed)
    # Only override do_target's own default bin size when one was chosen
    target_arr = target.do_target(
        bait_arr, annotate, short_names, True,
        **({
            'avg_size': target_avg_size
        } if target_avg_size else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    # Truthiness (not len() == 0) so that None is treated as "no normals",
    # matching the `if normal_bams:` check in the WGS branch above.
    if not normal_bams:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed,
                                              fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Two coverage jobs (target + antitarget) are submitted per BAM;
            # share the workers among them, with at least one each.
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage, target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn', by_count,
                                procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage, antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames,
                                         antitarget_fnames,
                                         fasta,
                                         male_reference,
                                         None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
Пример #6
0
def idxstats2ga(table, bam_fname):
    """Turn an idxstats-style table into a GA of whole-sequence regions.

    Each row becomes a region from 0 to the row's `length`, keeping only
    the chromosome/start/end columns. The BAM filename is passed to GA as
    `meta_dict={'filename': bam_fname}`.
    """
    spans = table.assign(start=0, end=table.length)
    regions = spans.loc[:, ('chromosome', 'start', 'end')]
    return GA(regions, meta_dict={'filename': bam_fname})
Пример #7
0
def idxstats2ga(table):
    """Turn an idxstats-style table into a GA of whole-sequence regions.

    Each row becomes a region from 0 to the row's `length`; only the
    chromosome/start/end columns are kept.
    """
    spans = table.assign(start=0, end=table.length)
    regions = spans.loc[:, ('chromosome', 'start', 'end')]
    return GA(regions)
Пример #8
0
        '-g',
        '--gene-resource',
        metavar="FILE",
        required=True,
        # default="data/ensembl-gene-info.hg38.tsv",
        help="Ensembl BioMart-derived gene info table.")
    AP.add_argument('-d',
                    '--output-dir',
                    metavar='PATH',
                    default='.',
                    help="Output directory.")

    args = AP.parse_args()
    gene_info = load_gene_info(args.gene_resource, None, None)
    bad_genes = [
        'Metazoa_SRP', '5S_rRNA', 'Y_RNA', 'U1', 'U2', 'U3', 'U4', 'U5', 'U6',
        'U7', 'U8', 'uc_338', 'Clostridiales-1'
    ]
    gene_info = gene_info[~gene_info['gene'].isin(bad_genes)]
    gene_info = GA(gene_info.loc[:, ('chromosome', 'start', 'end', 'gene')])

    for seg_fname in args.seg_files:
        seg = tabio.read(seg_fname, 'seg')
        # Assign gene names to segments using genomic coordinates from gene_info
        seg['gene'] = genes_in_segments(seg, gene_info)

        outfname = os.path.join(args.output_dir,
                                basename(seg_fname) + ".acgh.cns")
        tabio.write(seg, outfname, 'tab')
        print("Wrote", outfname)