示例#1
0
    def make_record(self):
        """
        Build ``self.vcf_record`` from the member records.

        A copy of the first record in ``self.records`` becomes the merged
        record. Every sample that is called in a later record but not yet
        called in the merged record has its genotype set to ``(0, 1)``.
        """
        self.vcf_record = self.records[0].copy()
        merged = self.vcf_record
        seen = set(svu.get_called_samples(merged))

        # Fold the calls of every remaining record into the merged copy
        for other in itertools.islice(self.records, 1, None):
            for name in svu.get_called_samples(other):
                if name in seen:
                    continue
                merged.samples[name]['GT'] = (0, 1)
                seen.add(name)
def filter_dn_variants(vcf, metrics, fout):
    """
    Write VCF records after applying de novo check results to genotypes.

    A (variant, sample) row of `metrics` passes when:
      * depth-only CNV (DEL/DUP): `rd_pass` is set
      * other CNV with svsize >= 1000: `rd_pass` is set
      * other CNV with svsize < 1000: `pesr_pass` is set
      * non-CNV: `pesr_pass` is set

    Parameters
    ----------
    vcf : iterable of VCF records
        Each record exposes `id` and `samples`.
    metrics : pd.DataFrame
        Columns read: name, sample, svtype, sources, svsize, rd_pass,
        pesr_pass.
    fout : writer
        Receives retained records through `fout.write(record)`.

    Records whose ID does not appear in `metrics` are written unchanged.
    For the others, passing samples get genotype (0, 1), failing samples are
    handed to `set_null`, and the record is written only if at least one
    sample is still called afterwards.
    """
    CNV = 'DEL DUP'.split()
    is_cnv = metrics.svtype.isin(CNV)
    depth_only = metrics.sources == 'depth'
    pesr_size_filter = (metrics.svsize >= 1000)

    passing = ((depth_only & is_cnv & metrics.rd_pass) |
               (~depth_only & is_cnv & pesr_size_filter & metrics.rd_pass) |
               (~depth_only & is_cnv & ~pesr_size_filter & metrics.pesr_pass) |
               (~is_cnv & metrics.pesr_pass))

    def _join_samples(s):
        return sorted(set(s))

    passes = metrics.loc[passing].groupby('name')['sample'].agg(_join_samples)
    fails = metrics.loc[~passing].groupby('name')['sample'].agg(_join_samples)

    # A set gives constant-time membership; testing against the array
    # returned by unique() would rescan every name for each VCF record.
    checked_variants = set(metrics.name.unique())
    for record in vcf:
        # Write records unaltered if they weren't included in the de novo check
        if record.id not in checked_variants:
            fout.write(record)
            continue

        # Otherwise set samples appropriately
        for sample in passes.get(record.id, []):
            record.samples[sample]['GT'] = (0, 1)

        for sample in fails.get(record.id, []):
            set_null(record, sample)

        # Only report record if any samples made it through de novo check
        if len(svu.get_called_samples(record)) > 0:
            fout.write(record)
示例#3
0
    def choose_background(self, record, whitelist=None, blacklist=None):
        """
        Split samples into those called in a record and a background set.

        Parameters
        ----------
        record
            Variant record passed to `svu.get_called_samples`.
        whitelist : container, optional
            Only samples in this container are kept. Falls back to
            `self.whitelist` when None.
        blacklist : container, optional
            Samples in this container are dropped. Falls back to
            `self.blacklist` when None.

        Returns
        -------
        called : list
            Called samples that survive both filters.
        background : list
            Samples from `self.samples` that are not called and survive both
            filters. When at least `self.n_background` are available, a
            random subset of that size is drawn without replacement.
        """
        # Explicit arguments take precedence over the instance-level lists
        if whitelist is None:
            whitelist = self.whitelist
        if blacklist is None:
            blacklist = self.blacklist

        def _permitted(candidates):
            return [s for s in candidates
                    if s in whitelist and s not in blacklist]

        carriers = svu.get_called_samples(record)
        non_carriers = [s for s in self.samples if s not in carriers]

        carriers = _permitted(carriers)
        non_carriers = _permitted(non_carriers)

        if len(non_carriers) >= self.n_background:
            picked = np.random.choice(non_carriers,
                                      self.n_background,
                                      replace=False)
            non_carriers = picked.tolist()

        return carriers, non_carriers
示例#4
0
def process_metadata(variants, bed=False, batch_list=None):
    """
    Tabulate parental/child variant frequency and inheritance rate per variant.

    Parameters
    ----------
    variants : iterable
        With `bed=False`, an object exposing `header.samples` that yields
        records with `id` and `info['SVTYPE']`. With `bed=True`, an iterable
        of text lines; lines starting with '#' are skipped and the remaining
        lines are whitespace-split, reading field 3 as the name, field 4 as a
        comma-separated list of called samples and field 5 as the svtype
        (0-based).
    bed : bool, optional
        Parse `variants` as BED lines instead of VCF records.
    batch_list : file-like, optional
        Required when `bed` is True; one sample ID per line.

    Returns
    -------
    metadata : pd.DataFrame
        Columns: name, svtype, parental_vf, child_vf, inh_rate. Empty (with
        these columns) when `variants` yields no usable rows.
    """
    if bed:
        samples = [s.strip() for s in batch_list.readlines()]
    else:
        samples = list(variants.header.samples)

    # Denominators for the variant frequencies
    n_parents = sum(1 for s in samples if _is_parent(s))
    n_children = sum(1 for s in samples if _is_child(s))

    cols = 'name svtype parental_vf child_vf inh_rate'.split()

    metadata = deque()
    for variant in variants:
        # bed record
        if bed:
            if variant.startswith('#'):
                continue
            data = variant.strip().split()
            called = data[4].split(',')
            name = data[3]
            svtype = data[5]
        # VCF record
        else:
            called = svu.get_called_samples(variant)
            name = variant.id
            svtype = variant.info['SVTYPE']

        # Calculate parental VF
        called_parents = [s for s in called if _is_parent(s)]
        if n_parents > 0:
            parental_vf = len(called_parents) / n_parents
        else:
            parental_vf = 0

        # Calculate child VF
        called_children = [s for s in called if _is_child(s)]
        if n_children > 0:
            child_vf = len(called_children) / n_children
        else:
            child_vf = 0

        # The inheritance rate is only computed when a child carries the variant
        if child_vf > 0:
            inh_rate = get_inh_rate(called)
        else:
            inh_rate = 0

        metadata.append([name, svtype, parental_vf, child_vf, inh_rate])

    # With no rows, np.array() yields an empty 1-D array whose shape cannot be
    # matched to five columns, so build the empty frame directly instead.
    if not metadata:
        return pd.DataFrame(columns=cols)

    # NOTE(review): going through np.array gives all columns one shared dtype
    # (strings when names are str); kept as-is so existing callers see the
    # same values.
    metadata = np.array(metadata)
    return pd.DataFrame(metadata, columns=cols)
示例#5
0
def samples_overlap(recA, recB, upper_thresh=0.8, lower_thresh=0.5):
    """
    Report if the samples called in two VCF records overlap sufficiently.

    The fraction of each record's samples which are shared with the other
    record is calculated. The record with a greater fraction of shared samples
    must meet the upper threshold AND the record with a lesser fraction of
    shared samples must meet the lower threshold. This is intended to
    maximize sensitivity in rare variants with a false negative in one
    breakpoint.

    A record with no called samples contributes a shared fraction of 0, so
    with positive thresholds such a pair never overlaps.

    Parameters
    ----------
    recA : pysam.VariantRecord
    recB : pysam.VariantRecord
    upper_thresh : float, optional
        Minimum sample overlap in record with greater overlap
    lower_thresh : float, optional
        Minimum sample overlap in record with lesser overlap

    Returns
    -------
    samples_overlap : bool
        Samples shared between records meet required thresholds.
    """

    # Get sets of called samples for each record
    samplesA = set(svu.get_called_samples(recA))
    samplesB = set(svu.get_called_samples(recB))

    shared = samplesA & samplesB

    # Compute fraction of each record's samples which are shared, treating a
    # record without calls as sharing nothing rather than dividing by zero
    fracA = len(shared) / len(samplesA) if samplesA else 0.0
    fracB = len(shared) / len(samplesB) if samplesB else 0.0

    min_frac, max_frac = sorted([fracA, fracB])

    return min_frac >= lower_thresh and max_frac >= upper_thresh
示例#6
0
def dn_test(record, parents, config):
    """
    Run the PE and SR tests on each parent of a record and write the results.

    For every parent, the samples of the record that are neither called nor
    one of the other parents are passed as the first argument to
    `pe_test` / `sr_test`, with the parent as the only sample under test.

    Parameters
    ----------
    record : variant record
        Read through `svu.get_called_samples`, `record.samples` and the
        `from_vcf` constructors of PEBreakpoint and SRBreakpoint.
    parents : list
        Parent sample IDs; sliced and concatenated with the called samples.
    config : object
        Provides `discfile` and `countfile` (test inputs) plus `petest` and
        `srtest` (output targets). Rows are written tab-separated, without
        header or index, using 'NA' for missing values.
    """
    pe_cols = 'name sample log_pval called_median bg_median'.split()
    sr_cols = 'name sample coord pos log_pval called_median bg_median'.split()
    csv_opts = dict(sep='\t', index=False, header=False, na_rep='NA')

    carriers = svu.get_called_samples(record)

    for idx, parent in enumerate(parents):
        # Exclude called samples and every parent other than the current one
        other_parents = parents[:idx] + parents[idx + 1:]
        excluded = carriers + other_parents
        all_samples = record.samples.keys()
        permitted = [s for s in all_samples if s not in excluded]

        # PE Test
        pe = PEBreakpoint.from_vcf(record)
        pe.samples = [parent]
        pe.pe_test(permitted,
                   config.discfile,
                   n_background=160,
                   window_in=50,
                   window_out=500)

        pe_stats = pe.stats
        pe_stats['name'] = pe.name
        pe_stats['sample'] = parent
        pe_stats[pe_cols].to_csv(config.petest, **csv_opts)

        # SR Test
        sr = SRBreakpoint.from_vcf(record)
        sr.samples = [parent]
        sr.sr_test(permitted, config.countfile, n_background=160, window=50)

        sr_pvals = sr.best_pvals
        sr_pvals['sample'] = parent
        sr_pvals = sr_pvals[sr_cols].fillna(0)

        # Positions are written as integers, p-values as magnitudes
        sr_pvals['pos'] = sr_pvals['pos'].round().astype(int)
        sr_pvals['log_pval'] = np.abs(sr_pvals['log_pval'])

        sr_pvals.to_csv(config.srtest, **csv_opts)
示例#7
0
    def from_vcf(cls, record):
        """
        Construct an instance from a VCF record.

        Parameters
        ----------
        record : pysam.VariantRecord
            Its INFO must contain ``CHR2`` and ``STRANDS``.

        Returns
        -------
        Instance created by calling ``cls`` positionally with
        (chrA, posA, chrB, posB, name, samples, strands), where the samples
        come from `svu.get_called_samples`.
        """
        info = record.info

        # First breakend from the record itself, second from CHR2 and stop
        chrA, posA = record.chrom, record.pos
        chrB, posB = info['CHR2'], record.stop

        name, strands = record.id, info['STRANDS']
        samples = svu.get_called_samples(record)

        return cls(chrA, posA, chrB, posB, name, samples, strands)
示例#8
0
def count_svtypes(vcf):
    """
    Tally how often each SVTYPE is called in each sample of a VCF.

    Parameters
    ----------
    vcf : pysam.VariantFile

    Returns
    -------
    counts : pd.DataFrame
        Long-format table with columns: sample, svtype, count. Records that
        carry no SVTYPE are tallied under 'NO_SVTYPE'.
    """

    # One svtype -> count mapping per sample named in the header
    count_dict = {sample: defaultdict(int) for sample in vcf.header.samples}

    for record in vcf:
        # The label is a property of the record, so resolve it once
        if 'SVTYPE' in record.info.keys():
            svtype = record.info['SVTYPE']
        else:
            svtype = 'NO_SVTYPE'

        for sample in svu.get_called_samples(record):
            count_dict[sample][svtype] += 1

    # Wide table (one column per svtype); missing combinations become zero
    wide = pd.DataFrame.from_dict(count_dict, orient='index')
    wide = wide.fillna(0).astype(int)
    wide = wide.reset_index().rename(columns={'index': 'sample'})

    # Reshape to one row per (sample, svtype) pair
    return pd.melt(wide,
                   id_vars=['sample'],
                   var_name='svtype',
                   value_name='count')