Пример #1
0
def main():
    """Run the clinical filtering analyses for every family.

    Reads the command line options with get_options(), sends log output to
    a file, loads the families with get_families(), builds one Filter from
    the options and then calls Filter.filter_trio() on each family in turn.
    """

    options = get_options()

    # Translate the level name from the options into the logging module's
    # attribute of that name (None when logging has no such attribute).
    level = getattr(logging, options.loglevel.upper(), None)

    # The log file is named after the ped file when one was given, otherwise
    # a fixed default name is used.
    log_path = ("clinical-filter.log" if options.ped is None
                else options.ped + ".log")
    logging.basicConfig(level=level, filename=log_path)

    families = get_families(options)

    # Add up is_affected() over every child of every family.
    affected_total = sum(
        child.is_affected()
        for family in families
        for child in family.children)

    finder = Filter(
        options.populations, affected_total, options.known_genes,
        options.genes_date, options.regions, options.lof_sites,
        options.pp_filter, options.sum_x_lr2_file, options.output,
        options.export_vcf, options.debug_chrom, options.debug_pos)

    for family in families:
        finder.filter_trio(family)
    def setUp(self):
        """Build the default Filter instance that each test works against.

        Every optional input is passed as None; only the population tags,
        a count of 1 and a pp_filter value of 0.9 are supplied. The result
        is stored on the test case as self.finder.
        """

        # tags passed to Filter as its first argument
        populations = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
        ]
        proband_count = 1
        pp_threshold = 0.9

        # none of the optional inputs are provided for the default object
        genes = genes_timestamp = None
        region_spec = None
        lof = None
        x_lr2 = None
        out_path = vcf_export = None
        dbg_chrom = dbg_pos = None

        self.finder = Filter(
            populations, proband_count, genes, genes_timestamp, region_spec,
            lof, pp_threshold, x_lr2, out_path, vcf_export, dbg_chrom,
            dbg_pos)
def main():
    """Entry point: filter the variants of every family given in the options.

    The steps are: parse the options (get_options), configure logging to a
    file, load the families (get_families), construct a single Filter from
    the options, and run Filter.filter_trio() once per family.
    """

    opts = get_options()

    # Look up the logging level constant matching the requested name; this
    # is None when the logging module has no attribute of that name.
    log_level = getattr(logging, opts.loglevel.upper(), None)

    # Log to "<ped>.log" when a ped file was supplied, else to the default.
    if opts.ped is None:
        log_file = "clinical-filter.log"
    else:
        log_file = opts.ped + ".log"

    logging.basicConfig(level=log_level, filename=log_file)

    families = get_families(opts)

    # Total of is_affected() across all children in all families.
    n_affected = sum(
        kid.is_affected() for fam in families for kid in fam.children)

    finder = Filter(
        opts.populations,
        n_affected,
        opts.known_genes,
        opts.genes_date,
        opts.regions,
        opts.lof_sites,
        opts.pp_filter,
        opts.sum_x_lr2_file,
        opts.output,
        opts.export_vcf,
        opts.debug_chrom,
        opts.debug_pos,
    )

    for fam in families:
        finder.filter_trio(fam)
Пример #4
0
 def setUp(self):
     """Build the default Filter instance that each test works against.

     Every optional input is passed as None; only the population tags, a
     count of 1 and a pp_filter value of 0.9 are supplied. The result is
     stored on the test case as self.finder.
     """

     # tags passed to Filter as its first argument
     populations = [
         "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
         "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
     ]
     proband_count = 1
     pp_threshold = 0.9

     # none of the optional inputs are provided for the default object
     genes = genes_timestamp = None
     region_spec = None
     lof = None
     out_path = vcf_export = None
     dbg_chrom = dbg_pos = None

     self.finder = Filter(
         populations, proband_count, genes, genes_timestamp, region_spec,
         lof, pp_threshold, out_path, vcf_export, dbg_chrom, dbg_pos)
Пример #5
0
class TestFilterPy(unittest.TestCase):
    """Unit tests for the Filter class."""

    @classmethod
    def setUpClass(cls):
        """Create one scratch directory shared by all tests in the class."""
        cls.temp_dir = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        """Remove the scratch directory and every file written into it."""
        shutil.rmtree(cls.temp_dir)

    def setUp(self):
        """Create the default Filter object used by each test.

        All optional inputs are None; only the population tags, a count of
        1 and a pp_filter of 0.9 are provided.
        """

        count = 1
        regions = None
        known_genes, genes_date = None, None
        output_path, export_vcf = None, None
        debug_chrom, debug_pos = None, None
        pp_filter = 0.9
        lof_sites = None
        population_tags = ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
            "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"]

        self.finder = Filter(population_tags, count, known_genes, genes_date,
            regions, lof_sites, pp_filter, output_path, export_vcf, debug_chrom,
            debug_pos)

    def test_analyse_trio(self):
        """Check that analyse_trio() reports a de novo found via trio VCFs."""

        # construct the VCFs for the trio members
        paths = {}
        for member in ['child', 'mom', 'dad']:
            vcf = make_vcf_header()

            # only the child carries the variant, flagged as a de novo
            geno, pp_dnm = '0/0', ''
            if member == 'child':
                geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'

            vcf.append(make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))

            # Write the VCF data to a file. The context manager closes the
            # handle (flushing the data) so no descriptors are leaked;
            # delete=False keeps the file on disk until tearDownClass()
            # removes the directory.
            with tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False,
                    suffix='.vcf') as handle:
                for line in vcf:
                    handle.write(line.encode('utf8'))

            paths[member] = handle.name

        # create a Family object, so we can load the data from the trio's VCFs
        fam_id = 'fam01'
        child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2', paths['child'])
        mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
        dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
        family = Family(fam_id, [child], mom, dad)

        self.assertEqual(self.finder.analyse_trio(family),
            [(TrioGenotypes(chrom="1", pos=1,
                child=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                    filter="PASS",
                    info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
                    format="DP:GT", sample="50:0/1", gender="female", mnv_code=None),
                mother=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                    filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
                    format="DP:GT", sample="50:0/0", gender="female", mnv_code=None),
                father=SNV(chrom="1", position=1, id=".", ref="G", alts="T",
                    filter="PASS", info="CQ=missense_variant;HGNC=ARID1B",
                    format="DP:GT", sample="50:0/0", gender="male", mnv_code=None)),
            ['single_variant'], ['Monoallelic'], ['ARID1B'])])

    def test_create_gene_dict(self):
        """Check that create_gene_dict() groups variants by gene symbol."""

        # create variants that share genes, or not
        snv1 = create_variant("F", "missense_variant|missense_variant", "TEST1|TEST2")
        snv2 = create_variant("F", "missense_variant", "TEST1")
        snv3 = create_variant("F", "missense_variant", "OTHER1")

        # the variants that share a gene should be grouped in lists indexed by
        # the gene key
        self.assertEqual(self.finder.create_gene_dict([snv1, snv2, snv3]),
            {"TEST1": [snv1, snv2], "TEST2": [snv1], "OTHER1": [snv3]})

    def test_find_variants(self):
        """Check find_variants() with and without a set of known genes."""

        # define the trio, so that we can know whether the parents are affected.
        # The child also needs to be included and set, so that we can get the
        # child ID for logging purposes.
        family = Family("famID")
        family.add_child("child_id", 'dad_id', 'mom_id', 'f', '2', "/vcf/path")
        family.add_father("dad_id", '0', '0', 'm', '1', "/vcf/path")
        family.add_mother("mom_id", '0', '0', 'f', '1', "/vcf/path")
        family.set_child()

        # create variants that cover various scenarios
        snv1 = create_variant("F", "missense_variant|missense_variant", "TEST1|TEST2")
        snv2 = create_variant("F", "missense_variant|synonymous_variant", "OTHER1|OTHER2")
        snv3 = create_variant("F", "missense_variant", "")
        snv4 = create_variant("F", "missense_variant", "TESTX", chrom="X")

        self.finder.known_genes = {"TEST1": {"inh": ["Monoallelic"]},
            "OTHER1": {"inh": ["Monoallelic"]},
            "OTHER2": {"inh": ["Monoallelic"]},
            "TESTX": {"inh": ["X-linked dominant"]}}

        # check the simplest case, a variant in a known gene
        self.assertEqual(self.finder.find_variants([snv1], "TEST1", family),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"])])

        # check that a gene not in a known gene does not pass
        self.assertEqual(self.finder.find_variants([snv1], "TEST2", family), [])

        # check a variant where the gene is known, but the consequence for that
        # gene is not functional, does not pass
        self.assertEqual(self.finder.find_variants([snv2], "OTHER2", family), [])

        # check that intergenic variants (which lack HGNC symbols) do not pass
        self.assertEqual(self.finder.find_variants([snv3], None, family), [])

        # check that a variant on chrX passes through the allosomal instance
        self.assertEqual(self.finder.find_variants([snv4], "TESTX", family),
            [(snv4, ["single_variant"], ["X-linked dominant"], ["TESTX"])])

        # remove the known genes, so that the variants in unknown genes pass
        self.finder.known_genes = None
        self.assertEqual(self.finder.find_variants([snv1], "TEST2", family),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST2"])])

        # but variants without gene symbols still are excluded
        self.assertEqual(self.finder.find_variants([snv3], None, family), [])

    def test_exclude_duplicates(self):
        """Check that exclude_duplicates() merges entries for one variant."""

        # create a variant that is within two genes
        snv1 = create_variant("F", "missense_variant|missense_variant", "TEST1|TEST2")

        # two variants that lie in different genes on different chromosomes
        # should not be merged
        snv2 = create_variant("F", "missense_variant", "OTHER1", chrom="2")
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
            (snv2, ["single_variant"], ["Monoallelic"], ["OTHER1"])]
        self.assertEqual(sorted(self.finder.exclude_duplicates(variants)), sorted(variants))

        # the same variant passing for the same gene under different checks
        # and inheritance modes (including an exact repeat) should collapse
        # into one entry that lists each check and inheritance mode once
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
            (snv1, ["compound_het"], ["Biallelic"], ["TEST1"]),
            (snv1, ["compound_het"], ["Biallelic"], ["TEST1"])]
        self.assertEqual(self.finder.exclude_duplicates(variants),
            [(snv1, ["single_variant", "compound_het"], ["Monoallelic", "Biallelic"], ["TEST1"])])

        # create a list of variant tuples that passed filtering for two
        # different gene symbols
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
            (snv1, ["single_variant"], ["Monoallelic"], ["TEST2"])]

        # the same variant passing for two gene symbols should be collapsed
        # into a single entry, where the entry contains a list of all the gene
        # symbols
        self.assertEqual(self.finder.exclude_duplicates(variants),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1", "TEST2"])])
class TestFilterPy(unittest.TestCase):
    """Unit tests for the Filter class."""

    @classmethod
    def setUpClass(cls):
        """Create one scratch directory shared by all tests in the class."""
        cls.temp_dir = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        """Remove the scratch directory and every file written into it."""
        shutil.rmtree(cls.temp_dir)

    def setUp(self):
        """Create the default Filter object used by each test.

        All optional inputs are None; only the population tags, a count of
        1 and a pp_filter of 0.9 are provided.
        """

        count = 1
        regions = None
        known_genes, genes_date = None, None
        output_path, export_vcf = None, None
        debug_chrom, debug_pos = None, None
        pp_filter = 0.9
        lof_sites = None
        population_tags = [
            "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF", "ESP_AF",
            "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"
        ]
        sum_x_lr2 = None

        self.finder = Filter(population_tags, count, known_genes, genes_date,
                             regions, lof_sites, pp_filter, sum_x_lr2,
                             output_path, export_vcf, debug_chrom, debug_pos)

    def test_analyse_trio(self):
        """Check that analyse_trio() reports a de novo found via trio VCFs."""

        # construct the VCFs for the trio members
        paths = {}
        for member in ['child', 'mom', 'dad']:
            vcf = make_vcf_header()

            # only the child carries the variant, flagged as a de novo
            geno, pp_dnm = '0/0', ''
            if member == 'child':
                geno, pp_dnm = '0/1', ';DENOVO-SNP;PP_DNM=1'

            vcf.append(
                make_vcf_line(genotype=geno, extra='HGNC=ARID1B' + pp_dnm))

            # Write the VCF data to a file. The context manager closes the
            # handle (flushing the data) so no descriptors are leaked;
            # delete=False keeps the file on disk until tearDownClass()
            # removes the directory.
            with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                             delete=False,
                                             suffix='.vcf') as handle:
                for line in vcf:
                    handle.write(line.encode('utf8'))

            paths[member] = handle.name

        # create a Family object, so we can load the data from the trio's VCFs
        fam_id = 'fam01'
        child = Person(fam_id, 'child', 'dad', 'mom', 'female', '2',
                       paths['child'])
        mom = Person(fam_id, 'mom', '0', '0', 'female', '1', paths['mom'])
        dad = Person(fam_id, 'dad', '0', '0', 'male', '1', paths['dad'])
        family = Family(fam_id, [child], mom, dad)

        self.assertEqual(self.finder.analyse_trio(family), [(TrioGenotypes(
            chrom="1",
            pos=1,
            child=SNV(
                chrom="1",
                position=1,
                id=".",
                ref="G",
                alts="T",
                qual='1000',
                filter="PASS",
                info="CQ=missense_variant;DENOVO-SNP;HGNC=ARID1B;PP_DNM=1",
                format="DP:GT",
                sample="50:0/1",
                gender="female",
                mnv_code=None),
            mother=SNV(chrom="1",
                       position=1,
                       id=".",
                       ref="G",
                       alts="T",
                       qual='1000',
                       filter="PASS",
                       info="CQ=missense_variant;HGNC=ARID1B",
                       format="DP:GT",
                       sample="50:0/0",
                       gender="female",
                       mnv_code=None),
            father=SNV(chrom="1",
                       position=1,
                       id=".",
                       ref="G",
                       alts="T",
                       qual='1000',
                       filter="PASS",
                       info="CQ=missense_variant;HGNC=ARID1B",
                       format="DP:GT",
                       sample="50:0/0",
                       gender="male",
                       mnv_code=None)), ['single_variant'], [
                           'Monoallelic', 'Mosaic'
                       ], ['ARID1B'])])

    def test_create_gene_dict(self):
        """Check that create_gene_dict() groups variants by gene symbol."""

        # create variants that share genes, or not
        snv1 = create_variant("F", "missense_variant|missense_variant",
                              "TEST1|TEST2")
        snv2 = create_variant("F", "missense_variant", "TEST1")
        snv3 = create_variant("F", "missense_variant", "OTHER1")

        # the variants that share a gene should be grouped in lists indexed by
        # the gene key
        self.assertEqual(self.finder.create_gene_dict([snv1, snv2, snv3]), {
            "TEST1": [snv1, snv2],
            "TEST2": [snv1],
            "OTHER1": [snv3]
        })

    def test_find_variants(self):
        """Check find_variants() with and without a set of known genes."""

        # define the trio, so that we can know whether the parents are affected.
        # The child also needs to be included and set, so that we can get the
        # child ID for logging purposes.
        family = Family("famID")
        family.add_child("child_id", 'dad_id', 'mom_id', 'f', '2', "/vcf/path")
        family.add_father("dad_id", '0', '0', 'm', '1', "/vcf/path")
        family.add_mother("mom_id", '0', '0', 'f', '1', "/vcf/path")
        family.set_child()

        # create variants that cover various scenarios
        snv1 = create_variant("F", "missense_variant|missense_variant",
                              "TEST1|TEST2")
        snv2 = create_variant("F", "missense_variant|synonymous_variant",
                              "OTHER1|OTHER2")
        snv3 = create_variant("F", "missense_variant", "")
        snv4 = create_variant("F", "missense_variant", "TESTX", chrom="X")

        self.finder.known_genes = {
            "TEST1": {
                "inh": ["Monoallelic"]
            },
            "OTHER1": {
                "inh": ["Monoallelic"]
            },
            "OTHER2": {
                "inh": ["Monoallelic"]
            },
            "TESTX": {
                "inh": ["X-linked dominant"]
            }
        }

        # check the simplest case, a variant in a known gene
        self.assertEqual(
            self.finder.find_variants([snv1], "TEST1", family),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"])])

        # check that a gene not in a known gene does not pass
        self.assertEqual(self.finder.find_variants([snv1], "TEST2", family),
                         [])

        # check a variant where the gene is known, but the consequence for that
        # gene is not functional, does not pass
        self.assertEqual(self.finder.find_variants([snv2], "OTHER2", family),
                         [])

        # check that intergenic variants (which lack HGNC symbols) do not pass
        self.assertEqual(self.finder.find_variants([snv3], None, family), [])

        # check that a variant on chrX passes through the allosomal instance
        self.assertEqual(
            self.finder.find_variants([snv4], "TESTX", family),
            [(snv4, ["single_variant"], ["X-linked dominant"], ["TESTX"])])

        # remove the known genes, so that the variants in unknown genes pass
        self.finder.known_genes = None
        self.assertEqual(
            sorted(self.finder.find_variants([snv1], "TEST2", family)),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST2"]),
             (snv1, ["single_variant"], ["Mosaic"], ["TEST2"])])

        # but variants without gene symbols still are excluded
        self.assertEqual(self.finder.find_variants([snv3], None, family), [])

    def test_exclude_duplicates(self):
        """Check that exclude_duplicates() merges entries for one variant."""

        # create a variant that is within two genes
        snv1 = create_variant("F", "missense_variant|missense_variant",
                              "TEST1|TEST2")

        # two variants that lie in different genes on different chromosomes
        # should not be merged
        snv2 = create_variant("F", "missense_variant", "OTHER1", chrom="2")
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
                    (snv2, ["single_variant"], ["Monoallelic"], ["OTHER1"])]
        self.assertEqual(sorted(self.finder.exclude_duplicates(variants)),
                         sorted(variants))

        # the same variant passing for the same gene under different checks
        # and inheritance modes (including an exact repeat) should collapse
        # into one entry that lists each check and inheritance mode once
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
                    (snv1, ["compound_het"], ["Biallelic"], ["TEST1"]),
                    (snv1, ["compound_het"], ["Biallelic"], ["TEST1"])]
        self.assertEqual(self.finder.exclude_duplicates(variants),
                         [(snv1, ["compound_het", "single_variant"
                                  ], ["Biallelic", "Monoallelic"], ["TEST1"])])

        # create a list of variant tuples that passed filtering for two
        # different gene symbols
        variants = [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1"]),
                    (snv1, ["single_variant"], ["Monoallelic"], ["TEST2"])]

        # the same variant passing for two gene symbols should be collapsed
        # into a single entry, where the entry contains a list of all the gene
        # symbols
        self.assertEqual(
            self.finder.exclude_duplicates(variants),
            [(snv1, ["single_variant"], ["Monoallelic"], ["TEST1", "TEST2"])])