def test_preliminary_filtering(self):
    """ check the pass/fail status returned by preliminary_filtering()
    """

    # an unmodified table of variants passes at every row
    all_pass = Series([True, True])
    result = preliminary_filtering(self.variants)
    self.assertTrue(all(result == all_pass))

    # variants from a sample listed as failing are excluded
    second_fails = Series([True, False])
    result = preliminary_filtering(self.variants, sample_fails=['b'])
    self.assertTrue(all(result == second_fails))

    # with an adjusted MAF threshold, candidates above it will fail
    first_fails = Series([False, True])
    result = preliminary_filtering(self.variants, maf_cutoff=0.0001)
    self.assertTrue(all(result == first_fails))

    # flagging a site as called in a parent makes that site fail
    self.variants['in_father_vcf'] = [1, 0]
    result = preliminary_filtering(self.variants)
    self.assertTrue(all(result == first_fails))
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False, build='grch37'):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line;
            surrounding whitespace is stripped. If None, no samples are
            treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates table and the combined preliminary pass status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.
        build: whether to use the 'grch37' or 'grch38' build to get
            missing symbols.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read within a context manager, so the file handle is always closed
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    # symbols are filled in before the main filter function is applied
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos, build)

    # a candidate only passes if it clears the filter function as well as
    # both of the initial screens
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)
def screen_candidates(de_novos_path, fails_path, filter_function, maf=0.01,
        fix_symbols=True, annotate_only=False):
    """ load and optionally filter candidate de novo mutations.

    Args:
        de_novos_path: path to table of unfiltered candidate DNMs. If this is
            None, nothing is loaded and None is returned.
        fails_path: path to file listing samples which failed QC, and therefore
            all of their candidates need to be excluded. One sample per line;
            surrounding whitespace is stripped. If None, no samples are
            treated as failing.
        filter_function: function for filtering the candidates, either
            filter_denovogear_sites(), or filter_missing_indels(). Called with
            the candidates table and the combined preliminary pass status.
        maf: MAF threshold for filtering. This is 0.01 for denovogear sites,
            and 0 for the missing indels.
        fix_symbols: whether to annotate HGNC symbols for candidates missing
            these.
        annotate_only: whether to include a column indicating pass status,
            rather than excluding all candidates which fail the filtering.

    Returns:
        pandas DataFrame of candidate de novo mutations, or None if
        de_novos_path is None.
    """

    if de_novos_path is None:
        return None

    # load the datasets
    de_novos = load_candidates(de_novos_path)
    sample_fails = []
    if fails_path is not None:
        # read within a context manager, so the file handle is always closed
        with open(fails_path) as handle:
            sample_fails = [x.strip() for x in handle]

    # run some initial screening
    status = preliminary_filtering(de_novos, sample_fails, maf_cutoff=maf)
    segdup = check_segdups(de_novos)

    # symbols are filled in before the main filter function is applied
    if fix_symbols:
        de_novos['symbol'] = fix_missing_gene_symbols(de_novos)

    # a candidate only passes if it clears the filter function as well as
    # both of the initial screens
    pass_status = filter_function(de_novos, status & segdup) & status & segdup

    if annotate_only:
        de_novos['pass'] = pass_status
    else:
        de_novos = de_novos[pass_status]

    return standardise_columns(de_novos)