Пример #1
0
def filter_missing_indels(candidates, *args):
    """ flag which candidate indels, not assessed by denovogear, pass filtering.
    
    The candidates are sites called in the child but not in the parents, which
    makes them possible de novo mutations. Sites checked by denovogear go
    through a separate filtering process. Denovogear has reduced sensitivity
    for indels, so the indel sites it did not examine are screened here with
    fixed thresholds on depths, alt counts and alt proportions.
    
    Args:
        candidates: pandas dataframe of de novo indel sites
        *args: accepted but never used by this function.
    
    Returns:
        boolean vector, True for each site that satisfies every criterion.
    """
    
    counts = extract_alt_and_ref_counts(candidates)
    depths = get_depths_and_proportions(counts)
    
    # row-wise minimum of the parental depths, and row-wise maximum of the
    # parental proportions, so that each threshold is applied to both parents
    depths["min_parent_depth"] = depths[["mom_depth", "dad_depth"]].min(axis=1)
    depths["max_parent_proportion"] = depths[["mom_prp", "dad_prp"]].max(axis=1)
    
    # the individual criteria for the missing indels, all of which must hold
    criteria = [
        depths["child_alts"] > 2,
        counts["min_parent_alt"] < 2,
        depths["min_parent_depth"] > 7,
        depths["max_parent_proportion"] < 0.1,
        depths["child_prp"] > 0.2,
    ]
    
    # combine the criteria left to right with a logical AND
    passed = criteria[0]
    for criterion in criteria[1:]:
        passed = passed & criterion
    
    return passed
def filter_denovogear_sites(de_novos, status):
    """ determine which de novo sites pass the denovogear filtering rules.
    
    SNVs fail when their strand bias value from test_sites() is below P_CUTOFF
    (indels are exempt from the strand bias check). Any site also fails when
    at least two of the following hold:
     (i) min_parent_alt exceeds the threshold computed from the parental depths
     (ii) the site-specific parental alt value is below P_CUTOFF
     (iii) the gene-specific parental alt value is below P_CUTOFF, and the
        gene is one with > 1 sites called (as given by get_recurrent_genes())
    
    Args:
        de_novos: dataframe of de novo variants
        status: list (or pandas Series) of booleans, handed on to test_sites()
            and test_genes()
    
    Returns:
        vector of true/false for whether each variant passes the filters
    """
    
    counts = extract_alt_and_ref_counts(de_novos)
    recurrent = get_recurrent_genes(de_novos)
    
    # assess strand bias and parental alt depths, per site and per gene
    strand_bias, parental_site_bias = test_sites(counts, status)
    parental_gene_bias = test_genes(counts, strand_bias, status)
    
    # a variant with a ref or alt allele longer than one base is an indel, and
    # indels always pass the strand bias check
    unbiased_strand = strand_bias >= P_CUTOFF
    ref_not_single_base = de_novos["ref"].str.len() != 1
    alt_not_single_base = de_novos["alt"].str.len() != 1
    overall_pass = unbiased_strand | ref_not_single_base | alt_not_single_base
    
    # gene-level failures only count in genes with more than one site called
    in_recurrent_gene = de_novos["symbol"].isin(recurrent)
    gene_fail = (parental_gene_bias < P_CUTOFF) & in_recurrent_gene
    site_fail = parental_site_bias < P_CUTOFF
    
    # total read depth in each parent, summed over ref/alt and both strands
    father_columns = ['father_ref_F', 'father_ref_R', 'father_alt_F', 'father_alt_R']
    mother_columns = ['mother_ref_F', 'mother_ref_R', 'mother_alt_F', 'mother_alt_R']
    counts['dad_depth'] = counts[father_columns].sum(axis=1)
    counts['mom_depth'] = counts[mother_columns].sum(axis=1)
    
    # per-site alt threshold, computed row-wise from the two parental depths
    parent_depths = counts[['dad_depth', 'mom_depth']]
    counts['parental_depth_threshold'] = parent_depths.apply(
        min_depth, error=ERROR_RATE, axis=1)
    
    excess_alts = counts["min_parent_alt"] > counts['parental_depth_threshold']
    
    # count how many of the three classes each site fails, and reject the
    # sites that fail at least two of them
    sites = pandas.DataFrame({
        "gene": gene_fail,
        "site": site_fail,
        "alts": excess_alts,
    })
    failed_classes = sites.sum(axis=1)
    overall_pass[failed_classes >= 2] = False
    
    return overall_pass
 def test_extract_alt_and_ref_counts(self):
     ''' check that extract_alt_and_ref_counts() gives the expected table
     
     The counts extracted from self.variants are compared against the
     reference table held in self.counts.
     '''
     
     observed = extract_alt_and_ref_counts(self.variants)
     expected = self.counts
     
     self.compare_tables(observed, expected)
Пример #4
0
    def test_extract_alt_and_ref_counts(self):
        ''' check that extract_alt_and_ref_counts() gives the expected table

        The counts extracted from self.variants are compared against the
        reference table held in self.counts.
        '''

        observed = extract_alt_and_ref_counts(self.variants)
        expected = self.counts

        self.compare_tables(observed, expected)
def filter_denovogear_sites(de_novos, status):
    """ set flags for filtering de novo sites called by denovogear.
    
    Sites fail if they have poor depth (child alts <= 1, child depth <= 7, or
    either parental depth <= 5). SNVs fail if their strand bias value is below
    P_CUTOFF (indels are exempt from the strand bias check). Sites also fail
    when at least two of the following hold:
     (i) min_parent_alt exceeds the threshold computed from the parental depths
     (ii) site-specific parental alts < threshold,
     (iii) gene-specific parental alts < threshold, if > 1 sites called in gene
    
    Args:
        de_novos: dataframe of de novo variants
        status: list (or pandas Series) of booleans. This is combined with the
            depth filter before being passed to test_sites() and test_genes().
            The caller's object is not modified.
    
    Returns:
        vector of true/false for whether each variant passes the filters
    """

    counts = extract_alt_and_ref_counts(de_novos)
    recurrent = get_recurrent_genes(de_novos)

    # total the reads per sample, summed across both strands
    counts['child_alts'] = counts[['child_alt_F', 'child_alt_R']].sum(axis=1)
    counts['child_depth'] = counts[[
        'child_ref_F', 'child_ref_R', 'child_alt_F', 'child_alt_R'
    ]].sum(axis=1)
    counts['dad_depth'] = counts[[
        'father_ref_F', 'father_ref_R', 'father_alt_F', 'father_alt_R'
    ]].sum(axis=1)
    counts['mom_depth'] = counts[[
        'mother_ref_F', 'mother_ref_R', 'mother_alt_F', 'mother_alt_R'
    ]].sum(axis=1)

    # only include sites with good sample depths (different threshold for child
    # and parents) and sufficient alts in the child
    good_depth = (counts['child_alts'] > 1) & (counts['child_depth'] > 7) & \
        (counts['dad_depth'] > 5) & (counts['mom_depth'] > 5)

    # Rebind rather than use "status &= good_depth": the in-place operator
    # would modify a pandas Series passed in by the caller (but not a list),
    # silently changing the caller's data.
    status = status & good_depth

    # check if sites deviate from expected strand bias and parental alt depths
    strand_bias, parental_site_bias = test_sites(counts, status)
    parental_gene_bias = test_genes(counts, strand_bias, status)

    # fail SNVs with excessive strand bias. Don't check strand bias in indels.
    overall_pass = (strand_bias >= P_CUTOFF) | (de_novos["ref"].str.len() != 1) | \
        (de_novos["alt"].str.len() != 1)

    # find if each de novo has passed each of three different filtering strategies
    # fail sites with gene-specific parental alts, only if >1 sites called per gene
    gene_fail = (parental_gene_bias < P_CUTOFF) & \
        de_novos["symbol"].isin(recurrent)
    site_fail = (parental_site_bias < P_CUTOFF)

    # per-site alt threshold, computed row-wise from the two parental depths
    counts['parental_depth_threshold'] = counts[
        ['dad_depth', 'mom_depth']
    ].apply(min_depth, error=ERROR_RATE, axis=1)

    excess_alts = counts["min_parent_alt"] > counts['parental_depth_threshold']

    # exclude sites that fail two of three classes
    sites = pandas.DataFrame({
        "gene": gene_fail,
        "site": site_fail,
        "alts": excess_alts,
    })
    overall_pass[sites.sum(axis=1) >= 2] = False

    # drop out sites with poor depths (overall_pass is local to this function,
    # so the in-place update here cannot affect the caller)
    overall_pass &= good_depth

    return overall_pass