Пример #1
0
def get_contigs(data):
    """Return the names of the autosomal and sex chromosome contigs.

    Contig names are taken from ``shared.get_noalt_contigs(data)`` and kept,
    in their original order, when either ``chromhacks.is_autosomal`` or
    ``chromhacks.is_sex`` accepts them.
    """
    def _is_standard(name):
        return chromhacks.is_autosomal(name) or chromhacks.is_sex(name)

    names = [contig.name for contig in shared.get_noalt_contigs(data)]
    return list(filter(_is_standard, names))
Пример #2
0
def _get_autosomal_bed(data, base_file):
    """Write a BED file with one whole-contig region per autosomal contig.

    The output path is built from the first element returned by
    ``utils.splitext_plus(base_file)`` plus ``-stdchroms.bed``. Contigs come
    from ``ref.file_contigs`` on the reference file for ``data``; each kept
    contig becomes a line running from 0 to the contig size.

    Returns the path to the written BED file.
    """
    prefix = utils.splitext_plus(base_file)[0]
    out_file = "%s-stdchroms.bed" % prefix
    with open(out_file, "w") as out_handle:
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if not chromhacks.is_autosomal(contig.name):
                continue
            out_handle.write("%s\t0\t%s\n" % (contig.name, contig.size))
    return out_file
Пример #3
0
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics.

    The output is named `<in_file base name>-<vcaller>-prep.csv` inside work_dir
    and is only regenerated when utils.file_uptodate reports it as stale relative
    to in_file. params falls back to the module-level PARAMS when not supplied.
    seg_file is accepted but unused while heterogeneity block subsetting is
    disabled. Returns the path to the output file.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Heterogeneity block subsetting is currently disabled; the call is kept for reference:
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        max_depth = max_normal_germline_depth(sub_file, params, somatic_info)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                # Use the context manager so the variant file handle is closed,
                # including when an error interrupts the loop.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        stats = _is_possible_loh(rec, bcf_in, params, somatic_info, max_normal_depth=max_depth)
                        if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                            writer.write_row(rec, stats)
    return out_file
Пример #4
0
def _prep_vrn_file(in_file, vcaller, seg_file, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    Writes a CSV with the columns chrom, start, end and freq to
    `<in_file base name>-<vcaller>-prep.csv` inside work_dir. A row is written for
    each autosomal record where _is_possible_loh returns a value. The file is only
    regenerated when utils.file_uptodate reports it as stale relative to in_file.
    seg_file is accepted but unused while heterogeneity block subsetting is
    disabled. Returns the path to the output file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4,
              "max_freq": 0.6,
              "tumor_only": {"min_freq": 0.10, "max_freq": 0.90},
              "min_depth": 20,
              "hetblock": {"min_alleles": 25,
                           "allowed_misses": 2}}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Heterogeneity block subsetting is currently disabled; the call is kept for reference:
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Use the context manager so the variant file handle is closed,
                # including when an error interrupts the loop.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, bcf_in, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
Пример #5
0
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Convert a tab-delimited CNV segment file into a CSV of autosomal calls.

    The `chromosome`, `start`, `end`, `probes` and `log2` input columns are looked
    up by header name and written as `chrom`, `start`, `end`, `num.mark` and
    `seg.mean`, with the chromosome passed through `_to_ucsc_style`. Rows whose
    chromosome is rejected by `chromhacks.is_autosomal` are dropped.

    Returns the output path; the file is left alone when already up to date.
    """
    base_name = utils.splitext_plus(os.path.basename(cns_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (base_name, svcaller))
    if utils.file_uptodate(out_file, cns_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(cns_file) as in_handle, open(tx_out_file, "w") as out_handle:
            rows = csv.reader(in_handle, dialect="excel-tab")
            out_csv = csv.writer(out_handle)
            out_csv.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
            columns = next(rows)
            for values in rows:
                record = dict(zip(columns, values))
                chrom = record["chromosome"]
                if not chromhacks.is_autosomal(chrom):
                    continue
                out_csv.writerow([_to_ucsc_style(chrom), record["start"], record["end"],
                                  record["probes"], record["log2"]])
    return out_file
Пример #6
0
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Yield tumor frequencies grouped by chromosome as inputs to HMM.

    Generator producing (chromosome, freqs, coords) tuples, where freqs holds
    the tumor "freq" statistic and coords the matching record start positions.
    Only records passing _is_biallelic_snp, _passes_plus_germline and
    chromhacks.is_autosomal are considered, and a chromosome is emitted only
    when at least one frequency was collected for it.
    """
    chrom = None
    chrom_freqs, chrom_coords = [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            usable = (_is_biallelic_snp(rec) and _passes_plus_germline(rec)
                      and chromhacks.is_autosomal(rec.chrom))
            if not usable:
                continue
            if chrom is None or rec.chrom != chrom:
                # Moving to a new chromosome: emit what was gathered for the previous one
                if chrom and len(chrom_freqs) > 0:
                    yield chrom, chrom_freqs, chrom_coords
                chrom = rec.chrom
                chrom_freqs, chrom_coords = [], []
            stats = _tumor_normal_stats(rec, somatic_info)
            if not (tz.get_in(["tumor", "depth"], stats, 0) > params["min_depth"]):
                continue
            # not a ref only call
            has_alt = (len(rec.samples) == 0
                       or sum(rec.samples[somatic_info.tumor_name].allele_indices) > 0)
            if has_alt:
                chrom_freqs.append(tz.get_in(["tumor", "freq"], stats))
                chrom_coords.append(rec.start)
        # Emit the final chromosome, which has no following record to trigger it
        if chrom and len(chrom_freqs) > 0:
            yield chrom, chrom_freqs, chrom_coords
Пример #7
0
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics.

    The output is named `<in_file base name>-<vcaller>-prep.csv` inside work_dir
    and is only regenerated when utils.file_uptodate reports it as stale relative
    to in_file. params falls back to the module-level PARAMS when not supplied.
    seg_file is accepted but unused while heterogeneity block subsetting is
    disabled. Returns the path to the output file.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Heterogeneity block subsetting is currently disabled; the call is kept for reference:
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                # Use the context manager so the variant file handle is closed,
                # including when an error interrupts the loop.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        stats = _is_possible_loh(rec, bcf_in, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                            writer.write_row(rec, stats)
    return out_file
Пример #8
0
def _prep_cnv_file(cns_file, svcaller, calls_by_name, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    The input is first passed through theta.subset_by_supported. Each data row
    of the resulting tab-delimited file must have exactly six columns, read as
    chromosome, start, end, an ignored column, log2 and probes. Autosomal rows
    are written as chrom, start, end, num.mark, seg.mean with the chromosome
    passed through _to_ucsc_style. Returns the path to the output file.
    """
    in_file = theta.subset_by_supported(cns_file,
                                        _cns_to_coords,
                                        calls_by_name,
                                        work_dir,
                                        data,
                                        headers=("chromosome", "#"))
    out_file = os.path.join(
        work_dir, "%s-%s-prep.csv" %
        (utils.splitext_plus(os.path.basename(in_file))[0], svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(
                        ["chrom", "start", "end", "num.mark", "seg.mean"])
                    # Skip the header. The next() builtin works on Python 2 and 3,
                    # while the reader.next() method only exists on Python 2.
                    next(reader)
                    for chrom, start, end, _, log2, probes in reader:
                        if chromhacks.is_autosomal(chrom):
                            writer.writerow([
                                _to_ucsc_style(chrom), start, end, probes, log2
                            ])
    return out_file
Пример #9
0
def _prep_vrn_file(in_file, vcaller, work_dir, calls_by_name, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    The input is subset with the regions from _identify_heterogenity_blocks
    before filtering. Writes a CSV with the columns chrom, start, end and freq
    to `<in_file base name>-<vcaller>-prep.csv` inside work_dir, one row per
    autosomal record where _is_possible_loh returns a value. calls_by_name is
    accepted but unused while SV call removal is disabled. Returns the path to
    the output file.
    """
    data = somatic_info.tumor_data
    params = {
        "min_freq": 0.4,
        "max_freq": 0.6,
        "min_depth": 15,
        "hetblock": {
            "min_alleles": 25,
            "allowed_misses": 2
        }
    }
    out_file = os.path.join(
        work_dir, "%s-%s-prep.csv" %
        (utils.splitext_plus(os.path.basename(in_file))[0], vcaller))
    if not utils.file_uptodate(out_file, in_file):
        ready_bed = _identify_heterogenity_blocks(in_file, params, work_dir,
                                                  somatic_info)
        # Removal of SV calls is currently disabled; the call is kept for reference:
        # ready_bed = _remove_sv_calls(het_bed, calls_by_name, somatic_info.tumor_data)
        sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Use the context manager so the variant file handle is closed,
                # including when an error interrupts the loop.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, params, somatic_info)
                        if chromhacks.is_autosomal(
                                rec.chrom) and tumor_freq is not None:
                            writer.writerow([
                                _to_ucsc_style(rec.chrom), rec.start, rec.stop,
                                tumor_freq
                            ])
    return out_file
Пример #10
0
def coverage_interval_from_bed(bed_file, per_chrom=True):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and non
    which depend on the local context.

    Checks coverage per chromosome, avoiding non-standard chromosomes, if per_chrom is set.
    Otherwise does a global check over all regions. The global check performs better for
    strelka2 but not for DeepVariant:

    https://github.com/bcbio/bcbio_validations/tree/master/deepvariant#deepvariant-v06-release-strelka2-stratification-and-initial-gatk-cnn

    The fraction compared is the summed region length divided by the span from
    the smallest start to the largest end. Chromosomes (or, for the global check,
    the whole input) with a zero-length span are skipped rather than dividing by
    zero.

    Returns "genome" if any computed fraction is at least 0.40, otherwise "targeted".
    """
    total_starts = {}
    total_ends = {}
    bed_bases = collections.defaultdict(int)
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                if chromhacks.is_autosomal(chrom):
                    start = int(start)
                    end = int(end)
                    bed_bases[chrom] += (end - start)
                    total_starts[chrom] = min(
                        start, total_starts.get(chrom, sys.maxsize))
                    total_ends[chrom] = max(end, total_ends.get(chrom, 0))
    # can check per chromosome -- any one chromosome with larger, or over all regions
    if per_chrom:
        freqs = []
        for c in sorted(bed_bases.keys()):
            span = total_ends[c] - total_starts[c]
            # A lone zero-length feature gives a zero span; skip it to avoid ZeroDivisionError
            if span != 0:
                freqs.append(float(bed_bases[c]) / float(span))
    elif len(bed_bases) > 0:
        total_bases = sum(bed_bases[c] for c in bed_bases)
        total_span = sum(total_ends[c] - total_starts[c] for c in bed_bases)
        freqs = [total_bases / float(total_span)] if total_span != 0 else []
    else:
        freqs = []
    # Should be importing GENOME_COV_THRESH but get circular imports
    if any(f >= 0.40 for f in freqs):
        return "genome"
    else:
        return "targeted"
Пример #11
0
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    Reads the tab-delimited cns_file, using the first six columns of each data
    row as chromosome, start, end, an ignored column, log2 and probes. Autosomal
    rows are written as chrom, start, end, num.mark, seg.mean with the chromosome
    passed through _to_ucsc_style. Returns the path to the output file.
    """
    in_file = cns_file
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                    # Skip the header. The next() builtin works on Python 2 and 3,
                    # while the reader.next() method only exists on Python 2.
                    next(reader)
                    # Only the first six columns are used; any extra columns are ignored
                    for chrom, start, end, _, log2, probes in (xs[:6] for xs in reader):
                        if chromhacks.is_autosomal(chrom):
                            writer.writerow([_to_ucsc_style(chrom), start, end, probes, log2])
    return out_file
Пример #12
0
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Collect tumor frequencies grouped by chromosome as inputs to HMM.

    Returns three parallel lists: chromosome names, a list of tumor "freq"
    values per chromosome, and a list of the matching record start positions
    per chromosome. Only records passing _is_biallelic_snp and
    chromhacks.is_autosomal are considered.
    """
    chroms, freqs, coords = [], [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not (_is_biallelic_snp(rec) and chromhacks.is_autosomal(rec.chrom)):
                continue
            # Open a new group whenever the chromosome differs from the last one seen
            if len(chroms) == 0 or rec.chrom != chroms[-1]:
                chroms.append(rec.chrom)
                freqs.append([])
                coords.append([])
            stats = _tumor_normal_stats(rec, somatic_info)
            if not (tz.get_in(["tumor", "depth"], stats, 0) > params["min_depth"]):
                continue
            # not a ref only call
            tumor_alleles = rec.samples[somatic_info.tumor_name].allele_indices
            if sum(tumor_alleles) > 0:
                freqs[-1].append(tz.get_in(["tumor", "freq"], stats))
                coords[-1].append(rec.start)
    return chroms, freqs, coords
Пример #13
0
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Write autosomal CNV calls from a tab-delimited segment file as CSV.

    Input columns are matched by header name (`chromosome`, `start`, `end`,
    `probes`, `log2`) and emitted under the header `chrom`, `start`, `end`,
    `num.mark`, `seg.mean`, with the chromosome passed through `_to_ucsc_style`.
    The output is only rebuilt when it is not up to date with the input.

    Returns the path to the output CSV.
    """
    prefix = utils.splitext_plus(os.path.basename(cns_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (prefix, svcaller))
    if utils.file_uptodate(out_file, cns_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(cns_file) as in_handle, open(tx_out_file, "w") as out_handle:
            reader = csv.reader(in_handle, dialect="excel-tab")
            writer = csv.writer(out_handle)
            writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
            header = next(reader)
            # Pair each row with the header so columns can be read by name
            records = (dict(zip(header, fields)) for fields in reader)
            for record in records:
                if chromhacks.is_autosomal(record["chromosome"]):
                    writer.writerow([_to_ucsc_style(record["chromosome"]), record["start"],
                                     record["end"], record["probes"], record["log2"]])
    return out_file
Пример #14
0
def _prep_vrn_file(in_file, vcaller, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    The input is first subset with _create_subset_file. Writes a CSV with the
    columns chrom, start, end and freq to `<in_file base name>-<vcaller>-prep.csv`
    inside work_dir, one row per autosomal record where _is_possible_loh returns
    a value. The file is only regenerated when utils.file_uptodate reports it as
    stale relative to in_file. Returns the path to the output file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4,
              "max_freq": 0.6,
              "min_depth": 15}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        sub_file = _create_subset_file(in_file, work_dir, data)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Use the context manager so the variant file handle is closed,
                # including when an error interrupts the loop.
                with VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
Пример #15
0
def coverage_interval_from_bed(bed_file, per_chrom=True):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and non
    which depend on the local context.

    Checks coverage per chromosome, avoiding non-standard chromosomes, if per_chrom is set.
    Otherwise does a global check over all regions. The global check performs better for
    strelka2 but not for DeepVariant:

    https://github.com/bcbio/bcbio_validations/tree/master/deepvariant#deepvariant-v06-release-strelka2-stratification-and-initial-gatk-cnn

    The fraction compared is the summed region length divided by the span from
    the smallest start to the largest end. Chromosomes (or, for the global check,
    the whole input) with a zero-length span are skipped rather than dividing by
    zero.

    Returns "genome" if any computed fraction is at least 0.40, otherwise "targeted".
    """
    total_starts = {}
    total_ends = {}
    bed_bases = collections.defaultdict(int)
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                if chromhacks.is_autosomal(chrom):
                    start = int(start)
                    end = int(end)
                    bed_bases[chrom] += (end - start)
                    total_starts[chrom] = min(start, total_starts.get(chrom, sys.maxsize))
                    total_ends[chrom] = max(end, total_ends.get(chrom, 0))
    # can check per chromosome -- any one chromosome with larger, or over all regions
    if per_chrom:
        freqs = []
        for c in sorted(bed_bases.keys()):
            span = total_ends[c] - total_starts[c]
            # A lone zero-length feature gives a zero span; skip it to avoid ZeroDivisionError
            if span != 0:
                freqs.append(float(bed_bases[c]) / float(span))
    elif len(bed_bases) > 0:
        total_bases = sum(bed_bases[c] for c in bed_bases)
        total_span = sum(total_ends[c] - total_starts[c] for c in bed_bases)
        freqs = [total_bases / float(total_span)] if total_span != 0 else []
    else:
        freqs = []
    # Should be importing GENOME_COV_THRESH but get circular imports
    if any(f >= 0.40 for f in freqs):
        return "genome"
    else:
        return "targeted"
Пример #16
0
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Collect normal-sample frequencies grouped by chromosome as inputs to HMM.

    Returns three parallel lists: chromosome names, a list of normal "freq"
    values per chromosome, and a list of the matching record start positions
    per chromosome. Depth filtering uses the normal "depth" statistic, while
    the non-reference check reads the tumor sample's allele indices.
    """
    chroms, freqs, coords = [], [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not (_is_biallelic_snp(rec) and chromhacks.is_autosomal(rec.chrom)):
                continue
            # Open a new group whenever the chromosome differs from the last one seen
            if len(chroms) == 0 or rec.chrom != chroms[-1]:
                chroms.append(rec.chrom)
                freqs.append([])
                coords.append([])
            stats = _tumor_normal_stats(rec, somatic_info)
            if not (tz.get_in(["normal", "depth"], stats, 0) > params["min_depth"]):
                continue
            # not a ref only call
            tumor_alleles = rec.samples[somatic_info.tumor_name].allele_indices
            if sum(tumor_alleles) > 0:
                freqs[-1].append(tz.get_in(["normal", "freq"], stats))
                coords[-1].append(rec.start)
    return chroms, freqs, coords
Пример #17
0
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Stream per-chromosome tumor frequencies as inputs to HMM.

    Generator yielding (chromosome, freqs, coords) once per run of records on
    the same chromosome, provided at least one frequency was collected. freqs
    holds the tumor "freq" statistic and coords the matching record starts.
    Records must pass _is_biallelic_snp, _passes_plus_germline and
    chromhacks.is_autosomal to be considered.
    """
    current = None
    cur_freqs = []
    cur_coords = []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not _is_biallelic_snp(rec):
                continue
            if not _passes_plus_germline(rec):
                continue
            if not chromhacks.is_autosomal(rec.chrom):
                continue
            if current is None or rec.chrom != current:
                # Chromosome changed: hand back the finished group before starting a new one
                if current and cur_freqs:
                    yield current, cur_freqs, cur_coords
                current = rec.chrom
                cur_freqs = []
                cur_coords = []
            stats = _tumor_normal_stats(rec, somatic_info)
            tumor_depth = tz.get_in(["tumor", "depth"], stats, 0)
            if tumor_depth > params["min_depth"]:
                # not a ref only call
                if len(rec.samples) == 0 or sum(rec.samples[somatic_info.tumor_name].allele_indices) > 0:
                    cur_freqs.append(tz.get_in(["tumor", "freq"], stats))
                    cur_coords.append(rec.start)
        # The last chromosome has no following record to trigger its yield
        if current and cur_freqs:
            yield current, cur_freqs, cur_coords