def filter_missing_indels(candidates, *args):
    """ flag which candidate missing indels pass the filtering criteria

    Some sites are called in the child but not in the parents, which makes
    them candidate de novo mutations. Most of those sites are examined by
    denovogear and are filtered by a separate process. Denovogear has reduced
    sensitivity for indels, so this function applies depth and alt-proportion
    criteria to the candidate indel sites that denovogear did not examine.

    Args:
        candidates: pandas dataframe of de novo indel sites
        *args: extra positional arguments are accepted, but not used

    Returns:
        boolean vector with one entry per candidate site, true where the
        site passes every one of the criteria.
    """

    allele_counts = extract_alt_and_ref_counts(candidates)
    site_depths = get_depths_and_proportions(allele_counts)

    # summarise the two parents with a single value per site: the smaller of
    # their depths, and the larger of their proportions
    parent_depths = site_depths[["mom_depth", "dad_depth"]]
    site_depths["min_parent_depth"] = parent_depths.min(axis=1)
    parent_proportions = site_depths[["mom_prp", "dad_prp"]]
    site_depths["max_parent_proportion"] = parent_proportions.max(axis=1)

    # each entry is one criterion that a missing indel has to meet
    criteria = [
        site_depths["child_alts"] > 2,
        allele_counts["min_parent_alt"] < 2,
        site_depths["min_parent_depth"] > 7,
        site_depths["max_parent_proportion"] < 0.1,
        site_depths["child_prp"] > 0.2,
    ]

    # a site only passes if it meets all of the criteria
    passing = criteria[0]
    for criterion in criteria[1:]:
        passing = passing & criterion

    return passing
def filter_denovogear_sites(de_novos, status):
    """ determine which denovogear-examined sites pass the filters

    A SNV fails if its strand bias p-value is below P_CUTOFF (strand bias is
    not checked for indels). Any site also fails if at least two of the
    following hold:
        (i) the minimum parental alt count exceeds the threshold derived
            from the parental depths,
        (ii) the site-specific parental alt p-value is below P_CUTOFF,
        (iii) the gene-specific parental alt p-value is below P_CUTOFF, and
            the gene is among the recurrent genes.

    Args:
        de_novos: dataframe of de novo variants
        status: list (or pandas Series) of booleans, passed through to the
            site and gene tests

    Returns:
        vector of true/false for whether each variant passes the filters
    """

    allele_counts = extract_alt_and_ref_counts(de_novos)
    recurrent_genes = get_recurrent_genes(de_novos)

    # test for deviation from the expected strand bias and parental alt depths
    strand_bias, parental_site_bias = test_sites(allele_counts, status)
    parental_gene_bias = test_genes(allele_counts, strand_bias, status)

    # gene-based failures only apply to genes with more than one site called
    in_recurrent_gene = de_novos["symbol"].isin(recurrent_genes)
    gene_fail = (parental_gene_bias < P_CUTOFF) & in_recurrent_gene
    site_fail = parental_site_bias < P_CUTOFF

    # total depth per parent, summed over ref and alt reads on both strands
    father_columns = ['father_ref_F', 'father_ref_R', 'father_alt_F',
        'father_alt_R']
    mother_columns = ['mother_ref_F', 'mother_ref_R', 'mother_alt_F',
        'mother_alt_R']
    allele_counts['dad_depth'] = allele_counts[father_columns].sum(axis=1)
    allele_counts['mom_depth'] = allele_counts[mother_columns].sum(axis=1)

    # find the sites where the parental alts exceed the depth-based threshold
    parent_depths = allele_counts[['dad_depth', 'mom_depth']]
    allele_counts['parental_depth_threshold'] = parent_depths.apply(min_depth,
        error=ERROR_RATE, axis=1)
    threshold = allele_counts['parental_depth_threshold']
    excess_alts = allele_counts["min_parent_alt"] > threshold

    # SNVs need acceptable strand bias. Indels (where the ref or alt allele
    # is not a single base) are exempt from the strand bias check.
    unbiased = strand_bias >= P_CUTOFF
    long_ref = de_novos["ref"].str.len() != 1
    long_alt = de_novos["alt"].str.len() != 1
    overall_pass = unbiased | long_ref | long_alt

    # count the failed classes per site, and exclude sites failing two or more
    failures = pandas.DataFrame({"gene": gene_fail, "site": site_fail,
        "alts": excess_alts})
    failed_classes = failures.sum(axis=1)
    overall_pass[failed_classes >= 2] = False

    return overall_pass
def test_extract_alt_and_ref_counts(self):
    ''' check that the allele counts taken from DP4 entries match expectations
    '''

    observed = extract_alt_and_ref_counts(self.variants)

    self.compare_tables(observed, self.counts)
def test_extract_alt_and_ref_counts(self):
    ''' check that the allele counts taken from DP4 entries match expectations
    '''

    observed = extract_alt_and_ref_counts(self.variants)

    self.compare_tables(observed, self.counts)
def filter_denovogear_sites(de_novos, status):
    """ determine which denovogear-examined sites pass the filters

    Sites need sufficient depth: more than 1 alt read and a depth above 7 in
    the child, and a depth above 5 in each parent. Sites without this depth
    always fail.

    A SNV fails if its strand bias p-value is below P_CUTOFF (strand bias is
    not checked for indels). Any site also fails if at least two of the
    following hold:
        (i) the minimum parental alt count exceeds the threshold derived
            from the parental depths,
        (ii) the site-specific parental alt p-value is below P_CUTOFF,
        (iii) the gene-specific parental alt p-value is below P_CUTOFF, and
            the gene is among the recurrent genes.

    Args:
        de_novos: dataframe of de novo variants
        status: list (or pandas Series) of booleans. This is combined with
            the depth filter before being passed to the site and gene tests.
            The caller's object is left unmodified.

    Returns:
        vector of true/false for whether each variant passes the filters
    """

    counts = extract_alt_and_ref_counts(de_novos)
    recurrent = get_recurrent_genes(de_novos)

    # total the alt reads in the child, and the depths for each family member
    counts['child_alts'] = counts[['child_alt_F', 'child_alt_R']].sum(axis=1)
    counts['child_depth'] = counts[['child_ref_F', 'child_ref_R',
        'child_alt_F', 'child_alt_R']].sum(axis=1)
    counts['dad_depth'] = counts[['father_ref_F', 'father_ref_R',
        'father_alt_F', 'father_alt_R']].sum(axis=1)
    counts['mom_depth'] = counts[['mother_ref_F', 'mother_ref_R',
        'mother_alt_F', 'mother_alt_R']].sum(axis=1)

    # only include sites with good sample depths (different threshold for child
    # and parents) and sufficient alts in the child
    good_depth = (counts['child_alts'] > 1) & (counts['child_depth'] > 7) & \
        (counts['dad_depth'] > 5) & (counts['mom_depth'] > 5)

    # Combine into a new object rather than using the in-place "&=", which
    # would silently modify a pandas Series owned by the caller.
    usable = status & good_depth

    # check if sites deviate from expected strand bias and parental alt depths
    strand_bias, parental_site_bias = test_sites(counts, usable)
    parental_gene_bias = test_genes(counts, strand_bias, usable)

    # fail SNVs with excessive strand bias. Don't check strand bias in indels.
    overall_pass = (strand_bias >= P_CUTOFF) | \
        (de_novos["ref"].str.len() != 1) | (de_novos["alt"].str.len() != 1)

    # find if each de novo has passed each of three different filtering
    # strategies. Fail sites with gene-specific parental alts, only if >1 sites
    # are called per gene.
    gene_fail = (parental_gene_bias < P_CUTOFF) & \
        de_novos["symbol"].isin(recurrent)
    site_fail = (parental_site_bias < P_CUTOFF)

    # find the sites where the parental alts exceed the depth-based threshold
    counts['parental_depth_threshold'] = counts[['dad_depth',
        'mom_depth']].apply(min_depth, error=ERROR_RATE, axis=1)
    excess_alts = counts["min_parent_alt"] > counts['parental_depth_threshold']

    # exclude sites that fail two of three classes
    sites = pandas.DataFrame({"gene": gene_fail, "site": site_fail,
        "alts": excess_alts})
    overall_pass[sites.sum(axis=1) >= 2] = False

    # drop out sites with poor depths
    overall_pass &= good_depth

    return overall_pass