def test_get_allele_counts(self):
    ''' verify allele tallies in both the per-gene and per-site modes
    '''

    # gene mode: the fixture's counts are aggregated for a gene
    per_gene = get_allele_counts(self.counts, gene=True)
    self.assertEqual(per_gene, {'gene_alt': 3, 'gene_ref': 195})

    # site mode: the counts are aggregated for a site, giving strand-specific
    # ref/alt tallies plus the parental ref/alt totals
    per_site = get_allele_counts(self.counts, gene=False)
    self.assertEqual(per_site, {
        'ref_F': 135,
        'ref_R': 125,
        'alt_F': 45,
        'alt_R': 43,
        'parent_alt': 3,
        'parent_ref': 195,
    })
예제 #2
0
    def test_get_allele_counts(self):
        ''' check get_allele_counts() output for gene=True and gene=False
        '''

        # each entry pairs the value passed as `gene` with the dictionary
        # expected back for the shared self.counts fixture; gene aggregation
        # is checked first, then site aggregation
        scenarios = (
            (True, {'gene_alt': 3, 'gene_ref': 195}),
            (False, {'ref_F': 135, 'ref_R': 125, 'alt_F': 45, 'alt_R': 43,
                     'parent_alt': 3, 'parent_ref': 195}),
        )

        for by_gene, expected in scenarios:
            observed = get_allele_counts(self.counts, gene=by_gene)
            self.assertEqual(observed, expected)
예제 #3
0
def test_sites(de_novos, pass_status=None):
    """ compute per-site strand bias and parental alt p-values

    Rows are grouped by their (chrom, pos, alt) tuple, the allele counts of
    each group are tallied with get_allele_counts(), and the p-values obtained
    per group are mapped back onto every row of de_novos.

    Args:
        de_novos: dataframe of de novo variants. Note that a "key" column
            holding the (chrom, pos, alt) tuple is written into this dataframe.
        pass_status: optional boolean indexer flagging the candidates that
            passed preliminary filtering (MAF, segdups). Tables sometimes
            include variants that failed those filters, because a pass status
            column is wanted rather than dropped rows. Rows that failed are
            left out of the allele counts used by both checks.

    Returns:
        tuple of two pandas Series built by mapping de_novos["key"]: first the
        p-values from the strand bias check, second the p-values from the
        check for an excess of parental alts.
    """

    de_novos["key"] = list(
        zip(de_novos["chrom"], de_novos["pos"], de_novos["alt"]))

    # key column followed by the four strand-specific counts for each person
    wanted = ["key"] + [
        "{}_{}".format(person, tally)
        for person in ("child", "mother", "father")
        for tally in ("ref_F", "ref_R", "alt_F", "alt_R")
    ]
    alleles = de_novos[wanted].copy()

    if pass_status is not None:
        alleles = alleles[pass_status]

    # tally the ref and alt alleles for each de novo site, remembering which
    # key every row of the results table belongs to
    keys = []
    counts = []
    for key, site in alleles.groupby("key"):
        keys.append(key)
        counts.append(get_allele_counts(site))

    results = pandas.DataFrame(counts)
    results["key"] = keys

    def spread(p_values):
        """ map per-site p-values back onto the rows of de_novos """
        lookup = dict(zip(results['key'], p_values))
        return de_novos['key'].map(lookup)

    # binomial test of the parental alt/ref counts against ERROR_RATE, to
    # pick up sites with an overabundance of parental alt alleles
    parent_counts = pandas.DataFrame({
        "alt": results["parent_alt"],
        "ref": results["parent_ref"],
    })
    parental_alt_p = parent_counts.apply(
        scipy.stats.binom_test, axis=1, p=ERROR_RATE)
    parental_bias = spread(parental_alt_p)

    # strand bias is assessed row by row on the allele counts
    strand_bias_p = results.apply(site_strand_bias, axis=1)
    strand_bias = spread(strand_bias_p)

    return strand_bias, parental_bias
예제 #4
0
def test_genes(de_novos, strand_bias, pass_status=None):
    """ checks if the variants in a gene have more parental ALTs than expected

    Args:
        de_novos: dataframe of de novo variants
        strand_bias: per-variant strand bias p-values; variants with a value
            below P_CUTOFF are left out of the gene-level counts.
        pass_status: optional boolean indexer; when given, both de_novos and
            strand_bias are subset with it before any counting.

    Returns:
        the "symbol" column of de_novos mapped to the binomial test p-value of
        its gene (parental alt versus ref counts, tested against ERROR_RATE).
        If no site is eligible for testing, a list of NaN with one entry per
        row of de_novos is returned instead.
    """

    sites = de_novos.copy()

    if pass_status is not None:
        sites = sites[pass_status]
        strand_bias = strand_bias.copy()[pass_status]

    # only SNVs that pass the strand bias filter are counted, otherwise the
    # failing sites skew the parental alts within genes
    unbiased = strand_bias >= P_CUTOFF
    single_base_ref = sites["ref"].str.len() == 1
    single_base_alt = sites["alt"].str.len() == 1
    sites = sites[unbiased & single_base_ref & single_base_alt]

    # edge case: nothing is left to test
    if len(sites) == 0:
        return [float('nan')] * len(de_novos)

    # gene symbol followed by the strand-specific counts for both parents
    wanted = ["symbol"] + [
        "{}_{}".format(parent, tally)
        for parent in ("mother", "father")
        for tally in ("ref_F", "ref_R", "alt_F", "alt_R")
    ]
    sites = sites[wanted].copy()

    # tally the parental alleles within each gene, remembering which symbol
    # every row of the results table belongs to
    symbols = []
    counts = []
    for symbol, site in sites.groupby("symbol"):
        symbols.append(symbol)
        counts.append(get_allele_counts(site, gene=True))

    results = pandas.DataFrame(counts)
    results["symbol"] = symbols

    # binomial test of the parental alt/ref counts against ERROR_RATE, to
    # pick up genes with an overabundance of parental alt alleles
    parent_counts = pandas.DataFrame({
        "alt": results["gene_alt"],
        "ref": results["gene_ref"],
    })
    parental_alt_p = parent_counts.apply(
        scipy.stats.binom_test, axis=1, p=ERROR_RATE)
    lookup = dict(zip(results["symbol"], parental_alt_p))

    return de_novos['symbol'].map(lookup)
예제 #5
0
def test_sites(de_novos, pass_status=None):
    """ tests each site for deviation from expected behaviour
    
    Args:
        de_novos: dataframe of de novo variants. A "key" column holding the
            (chrom, pos, alt) tuple of each row is added to this dataframe.
        pass_status: whether the candidate passed preliminary filtering for
            MAF, and segdups. Sometimes we pass in tables that include variants
            that failed MAF etc, since instead of filtering we want a column
            indicating pass status. We need to exclude these variants from the
            strand bias and parental alt checks.
    
    Returns:
        tuple of pandas Series, one of p-values from testing if the variants have
        a strand bias, and the second of p-values from testing if the variants
        have an excess of parental alts.
    """
    
    de_novos["key"] = list(zip(de_novos["chrom"], de_novos["pos"], de_novos["alt"]))
    
    count_columns = [
        "child_ref_F", "child_ref_R", "child_alt_F", "child_alt_R",
        "mother_ref_F", "mother_ref_R", "mother_alt_F", "mother_alt_R",
        "father_ref_F", "father_ref_R", "father_alt_F", "father_alt_R"]
    alleles = de_novos[["key"] + count_columns]
    
    if pass_status is not None:
        alleles = alleles[pass_status]
    
    # work on an explicit copy, so the numeric conversion below never writes
    # through to (or warns about) the caller's dataframe
    alleles = alleles.copy()
    
    # DataFrame.convert_objects() was removed in pandas 1.0. Coerce the count
    # columns to numbers explicitly instead (unparseable entries become NaN),
    # and leave the tuple-valued "key" column untouched.
    alleles[count_columns] = alleles[count_columns].apply(
        pandas.to_numeric, errors="coerce")
    variants = alleles.groupby("key")
    
    # count the ref and alt alleles for each de novo site
    counts = [ get_allele_counts(site) for name, site in variants ]
    results = pandas.DataFrame(counts)
    results["key"] = [ name for name, site in variants ]
    
    # check for overabundance of parental alt alleles using binomial test
    parent_counts = pandas.DataFrame({"alt": results["parent_alt"], "ref": results["parent_ref"]})
    parental_alt_p = parent_counts.apply(scipy.stats.binom_test, axis=1, p=ERROR_RATE)
    recode = dict(zip(results['key'], parental_alt_p))
    parental_bias = de_novos['key'].map(recode)
    
    # check for strand bias by fishers exact test on the allele counts
    strand_bias_p = results.apply(site_strand_bias, axis=1)
    recode = dict(zip(results['key'], strand_bias_p))
    strand_bias = de_novos['key'].map(recode)
    
    return strand_bias, parental_bias
예제 #6
0
def test_genes(de_novos, strand_bias, pass_status=None):
    """ checks if the variants in a gene have more parental ALTs than expected
    
    Args:
        de_novos: dataframe of de novo variants
        strand_bias: per-variant strand bias p-values; variants with a value
            below P_CUTOFF are excluded from the gene-level counts.
        pass_status: optional boolean indexer; when given, de_novos and
            strand_bias are both subset with it before any counting.
    
    Returns:
        the "symbol" column of de_novos mapped to the binomial test p-value of
        its gene (parental alt versus ref counts, tested against ERROR_RATE).
        If no site is eligible for testing, a list of NaN with one entry per
        row of de_novos is returned instead.
    """
    
    sites = de_novos.copy()
    
    if pass_status is not None:
        sites = sites[pass_status]
        strand_bias = strand_bias.copy()[pass_status]
    
    # exclude de novo SNVs that fail the strand bias filter, otherwise these
    # skew the parental alts within genes. The ref/alt length checks are taken
    # from `sites` rather than `de_novos`, so that every part of the mask
    # shares the index of the (possibly pass_status-filtered) table it selects
    # from.
    sites = sites[(strand_bias >= P_CUTOFF) & (sites["ref"].str.len() == 1) &
        (sites["alt"].str.len() == 1)]
    
    # cover the edge case where we don't have any sites for testing
    if len(sites) == 0:
        return [float('nan')] * len(de_novos)
    
    count_columns = [
        "mother_ref_F", "mother_ref_R", "mother_alt_F", "mother_alt_R",
        "father_ref_F", "father_ref_R", "father_alt_F", "father_alt_R"]
    sites = sites[["symbol"] + count_columns].copy()
    
    # DataFrame.convert_objects() was removed in pandas 1.0. Coerce the count
    # columns to numbers explicitly instead (unparseable entries become NaN),
    # and leave the "symbol" column untouched.
    sites[count_columns] = sites[count_columns].apply(
        pandas.to_numeric, errors="coerce")
    
    # count the number of parental alleles within genes, and restructure data
    # for testing
    genes = sites.groupby("symbol")
    counts = [ get_allele_counts(site, gene=True) for name, site in genes ]
    results = pandas.DataFrame(counts)
    results["symbol"] = [ name for name, site in genes ]
    
    # check for overabundance of parental alt alleles using binomial test
    parent_counts = pandas.DataFrame({"alt": results["gene_alt"], "ref": results["gene_ref"]})
    parental_alt_p = parent_counts.apply(scipy.stats.binom_test, axis=1, p=ERROR_RATE)
    recode = dict(zip(results["symbol"], parental_alt_p))
    
    return de_novos['symbol'].map(recode)