def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' collect per-transcript mutation rates, split by regional constraint

    Args:
        constraint: table with 'transcript', 'gene' and 'chr' columns. It is
            grouped by the 'transcript' column.
        cache_dir: folder handed to EnsemblRequest.
        threshold: forwarded unchanged to get_constrained_positions().
        ratio: forwarded unchanged to get_constrained_positions().

    Returns:
        dict with 'constrained' and 'unconstrained' keys, each a list holding
        one entry per transcript group.
    '''
    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')
    rates = {'constrained': [], 'unconstrained': []}

    for tx_id, group in constraint.groupby('transcript'):
        # only the part of the ID before the first '.' is used for the lookup
        transcript = construct_gene_object(ensembl, tx_id.split('.')[0])
        site_rates = SiteRates(transcript, mut_dict)
        constrained_sites = get_constrained_positions(
            transcript, group, threshold, ratio)

        consequences = [
            'nonsense', 'missense', 'synonymous', 'splice_lof', 'splice_region'
        ]
        per_gene = get_gene_rates(
            transcript, site_rates, consequences, constrained_sites)

        # tag each category's rates with gene details, then file them under
        # the matching category in the combined results
        for category, collected in rates.items():
            per_gene[category]['symbol'] = list(group['gene'])[0]
            per_gene[category]['chrom'] = list(group['chr'])[0]
            cds_end = transcript.chrom_pos_to_cds(transcript.get_cds_end())
            per_gene[category]['length'] = cds_end['pos']

            collected.append(per_gene[category])

    return rates
def main():
    ''' compute a severity P value per gene symbol and write them as a TSV

    Reads the command line options, loads the supporting data (ensembl
    requester, CADD tabix file, regional constraint, de novo mutations and
    mutation rates), then writes one 'symbol<TAB>p_value' line per gene to
    args.output.
    '''
    args = get_options()

    ensembl = EnsemblRequest(args.cache, args.genome_build)
    cadd = pysam.TabixFile(args.cadd)

    constraint = load_regional_constraint(args.constraint)

    # open de novo mutations
    all_de_novos = open_mutations(args.de_novos)

    mut_dict = load_mutation_rates()

    # the context manager guarantees the output is flushed and closed, even if
    # analysing one of the genes raises an exception
    with open(args.output, 'w') as output:
        output.write('symbol\tseverity_p_value\n')
        for symbol in all_de_novos:
            # skip entries without a usable gene symbol
            if symbol in ['', '.']:
                continue

            print(symbol)
            de_novos = all_de_novos[symbol]
            p_value = analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos,
                                   constraint, WEIGHTS)
            output.write('{}\t{}\n'.format(symbol, p_value))
예제 #3
0
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: pandas DataFrame with at least 'symbol' and 'pos' columns. A
            boolean 'constrained' column is added to it in place (default
            False).
        constraint_path: path handed to load_regional_constraint().
        threshold: forwarded unchanged to get_constrained_positions().
        ratio: forwarded unchanged to get_constrained_positions().

    Returns:
        DataFrame built by concatenating the per-symbol groups, where
        'constrained' is True for rows whose 'pos' is among the constrained
        positions of that gene. If data has no rows, a copy of data is
        returned.
    '''
    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    # NOTE(review): cache_dir is neither a parameter nor a local of this
    # function, so it must exist as a module-level name or this line raises
    # NameError - confirm against the full file.
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the lookup of genes with constraint data once, rather than on
    # every iteration of the loop below
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol not in constrained_genes:
            sites = set()
        else:
            regional = constraint[constraint['gene'] == symbol]
            # the first listed transcript is used for the gene
            tx_id = list(regional['transcript'])[0]
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)

        # work on an explicit copy, so we never assign onto a slice of data
        group = group.copy()
        group.loc[group['pos'].isin(sites), 'constrained'] = True

        modified.append(group)

    # pandas.concat raises ValueError when given nothing to concatenate
    if not modified:
        return data.copy()

    return pandas.concat(modified)
예제 #4
0
def main():
    ''' parse the options, load shared resources, and run the chosen command

    The function selected on the command line (args.func) is called with the
    ensembl requester, the mutation rates, the open output handle and the
    parsed arguments.
    '''
    args = get_options()

    ensembl = EnsemblRequest(args.cache_folder, args.genome_build.lower())
    mut_dict = load_mutation_rates(args.rates)

    # close the output file once the command finishes, or if it raises
    with open(args.out, "wt") as output:
        args.func(ensembl, mut_dict, output, args)
예제 #5
0
def main():
    ''' gather rates for every known dominant gene into one gzipped TSV

    Loads the genes from args.known, appends the result of get_gene_rates()
    for each one to a single table, and writes the table tab-separated
    (without the index) to args.output via gzip.
    '''
    args = get_options()

    ensembl = EnsemblRequest('cache', 'grch37')
    mut_dict = load_mutation_rates()

    dominant = load_dominant(args.known)

    # start from an empty table with integer positions
    column_names = ['symbol', 'chrom', 'pos', 'ref', 'alt', 'cq', 'prob']
    table = pandas.DataFrame(columns=column_names)
    table['pos'] = table['pos'].astype(int)

    for gene in dominant:
        print(gene)
        table = table.append(get_gene_rates(gene, ensembl, mut_dict),
                             ignore_index=True)

    with gzip.open(args.output, 'wt') as out:
        table.to_csv(out, sep='\t', index=False)
예제 #6
0
 def setUpClass(self):
     """ create a temporary cache folder and an EnsemblRequest that uses it
     """
     scratch_dir = tempfile.mkdtemp()
     self.temp_dir = scratch_dir
     self.ensembl = EnsemblRequest(scratch_dir, genome_build="grch37")
예제 #7
0
class TestEnsemblRequestPy(unittest.TestCase):
    """ unit test the EnsemblRequest class

    A single EnsemblRequest, backed by a temporary cache folder, is shared
    by all the tests in the class.
    """

    @classmethod
    def setUpClass(cls):
        """ create the temporary cache folder and the shared EnsemblRequest
        """
        cls.temp_dir = tempfile.mkdtemp()
        cls.ensembl = EnsemblRequest(cls.temp_dir, genome_build="grch37")

    @classmethod
    def tearDownClass(cls):
        """ remove the temporary cache folder
        """
        shutil.rmtree(cls.temp_dir)

    def test_open_url(self):
        """ test that open_url() works correctly
        """

        headers = {"Content-Type": "application/json"}
        url = "http://rest.ensembl.org/overlap/id/ENSG00000172320?feature=gene"
        # keep the response headers under their own name, rather than
        # overwriting the request headers
        (response, status_code, response_headers) = self.ensembl.open_url(
            url, headers)

        response = json.loads(response)

        self.assertEqual(status_code, 200)
        self.assertEqual(response, [{
            "source": "ensembl_havana",
            "logic_name": "ensembl_havana_gene",
            "feature_type": "gene",
            "external_name": "OR5A1",
            "seq_region_name": "11",
            "strand": 1,
            "id": "ENSG00000172320",
            "gene_id": "ENSG00000172320",
            "version": 3,
            "assembly_name": "GRCh38",
            "description": "olfactory receptor family 5 subfamily A member 1 [Source:HGNC Symbol;Acc:HGNC:8319]",
            "end": 59451380,
            "biotype": "protein_coding",
            "start": 59436469}]
            )

    def test_get_genes_for_hgnc_id(self):
        """ test that get_genes_for_hgnc_id() works correctly
        """

        genes = self.ensembl.get_genes_for_hgnc_id("KMT2A")
        self.assertEqual(genes, ['ENSG00000118058', 'ENSG00000267910'])

    def test_get_previous_symbol(self):
        """ test that get_previous_symbol() works correctly
        """

        prev = self.ensembl.get_previous_symbol("KMT2A")
        self.assertEqual(prev, ["MLL"])

        # make a check for a gene with multiple documents, to check that we
        # don't raise an error
        prev = self.ensembl.get_previous_symbol("KRT16P1")
        self.assertEqual(prev, ["KRT14P"])

    def test_get_transcript_ids_for_ensembl_gene_ids(self):
        """ test that get_transcript_ids_for_ensembl_gene_ids() works correctly
        """

        hgnc = ["KMT2A", "MLL"]
        ensg = ['ENSG00000118058', 'ENSG00000267910']

        enst = self.ensembl.get_transcript_ids_for_ensembl_gene_ids(ensg, hgnc)

        # compare as sets, since the order of the transcript IDs is not checked
        self.assertEqual(set(enst), {'ENST00000534358', 'ENST00000531904',
            'ENST00000389506', 'ENST00000354520', 'ENST00000532204',
            'ENST00000529852', 'ENST00000527869', 'ENST00000533790',
            'ENST00000392873'})

    def test_get_genomic_seq_for_transcript(self):
        """ check that get_genomic_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_genomic_seq_for_transcript("ENST00000302030", expand=0)

        self.assertEqual(seq, ('11', 59210617, 59211667, '+', 'CTTGTCCTTGTGGTCC'
            'ACGGGAAGCATGTCCATAACCAAAGCCTGGAACAGCTCATCAGTGACCATGTTCATCCTCCTGGGA'
            'TTCACAGACCATCCAGAACTCCAGGCCCTCCTCTTTGTGACCTTCCTGGGCATCTATCTTACCACC'
            'CTGGCCTGGAACCTGGCCCTCATTTTTCTGATCAGAGGTGACACCCATCTGCACACACCCATGTAC'
            'TTCTTCCTAAGCAACTTATCTTTCATTGACATCTGCTACTCTTCTGCTGTGGCTCCCAATATGCTC'
            'ACTGACTTCTTCTGGGAGCAGAAGACCATATCATTTGTGGGCTGTGCTGCTCAGTTTTTTTTCTTT'
            'GTCGGCATGGGTCTGTCTGAGTGCCTCCTCCTGACTGCTATGGCATACGACCGATATGCAGCCATC'
            'TCCAGCCCCCTTCTCTACCCCACTATCATGACCCAGGGCCTCTGTACACGCATGGTGGTTGGGGCA'
            'TATGTTGGTGGCTTCCTGAGCTCCCTGATCCAGGCCAGCTCCATATTTAGGCTTCACTTTTGCGGA'
            'CCCAACATCATCAACCACTTCTTCTGCGACCTCCCACCAGTCCTGGCTCTGTCTTGCTCTGACACC'
            'TTCCTCAGTCAAGTGGTGAATTTCCTCGTGGTGGTCACTGTCGGAGGAACATCGTTCCTCCAACTC'
            'CTTATCTCCTATGGTTACATAGTGTCTGCGGTCCTGAAGATCCCTTCAGCAGAGGGCCGATGGAAA'
            'GCCTGCAACACGTGTGCCTCGCATCTGATGGTGGTGACTCTGCTGTTTGGGACAGCCCTTTTCGTG'
            'TACTTGCGACCCAGCTCCAGCTACTTGCTAGGCAGGGACAAGGTGGTGTCTGTTTTCTATTCATTG'
            'GTGATCCCCATGCTGAACCCTCTCATTTACAGTTTGAGGAACAAAGAGATCAAGGATGCCCTGTGG'
            'AAGGTGTTGGAAAGGAAGAAAGTGTTTTCTTAGGTCATGCGTAGAAACTTATTTATCCAAACTGCT'
            'GGAGAATTAAACAATCCAAGCCTTCACCTCCACCTCTGCCTCAGG'))

    def test_get_cds_seq_for_transcript(self):
        """ check that get_cds_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_cds_seq_for_transcript("ENST00000302030")
        self.assertEqual(seq, 'ATGTCCATAACCAAAGCCTGGAACAGCTCATCAGTGACCATGTTCATC'
            'CTCCTGGGATTCACAGACCATCCAGAACTCCAGGCCCTCCTCTTTGTGACCTTCCTGGGCATCTAT'
            'CTTACCACCCTGGCCTGGAACCTGGCCCTCATTTTTCTGATCAGAGGTGACACCCATCTGCACACA'
            'CCCATGTACTTCTTCCTAAGCAACTTATCTTTCATTGACATCTGCTACTCTTCTGCTGTGGCTCCC'
            'AATATGCTCACTGACTTCTTCTGGGAGCAGAAGACCATATCATTTGTGGGCTGTGCTGCTCAGTTT'
            'TTTTTCTTTGTCGGCATGGGTCTGTCTGAGTGCCTCCTCCTGACTGCTATGGCATACGACCGATAT'
            'GCAGCCATCTCCAGCCCCCTTCTCTACCCCACTATCATGACCCAGGGCCTCTGTACACGCATGGTG'
            'GTTGGGGCATATGTTGGTGGCTTCCTGAGCTCCCTGATCCAGGCCAGCTCCATATTTAGGCTTCAC'
            'TTTTGCGGACCCAACATCATCAACCACTTCTTCTGCGACCTCCCACCAGTCCTGGCTCTGTCTTGC'
            'TCTGACACCTTCCTCAGTCAAGTGGTGAATTTCCTCGTGGTGGTCACTGTCGGAGGAACATCGTTC'
            'CTCCAACTCCTTATCTCCTATGGTTACATAGTGTCTGCGGTCCTGAAGATCCCTTCAGCAGAGGGC'
            'CGATGGAAAGCCTGCAACACGTGTGCCTCGCATCTGATGGTGGTGACTCTGCTGTTTGGGACAGCC'
            'CTTTTCGTGTACTTGCGACCCAGCTCCAGCTACTTGCTAGGCAGGGACAAGGTGGTGTCTGTTTTC'
            'TATTCATTGGTGATCCCCATGCTGAACCCTCTCATTTACAGTTTGAGGAACAAAGAGATCAAGGAT'
            'GCCCTGTGGAAGGTGTTGGAAAGGAAGAAAGTGTTTTCTTAG')

    def test_get_protein_seq_for_transcript(self):
        """ test that get_protein_seq_for_transcript() works correctly
        """

        seq = self.ensembl.get_protein_seq_for_transcript("ENST00000302030")
        self.assertEqual(seq, 'MSITKAWNSSSVTMFILLGFTDHPELQALLFVTFLGIYLTTLAWNLAL'
            'IFLIRGDTHLHTPMYFFLSNLSFIDICYSSAVAPNMLTDFFWEQKTISFVGCAAQFFFFVGMGLSE'
            'CLLLTAMAYDRYAAISSPLLYPTIMTQGLCTRMVVGAYVGGFLSSLIQASSIFRLHFCGPNIINHF'
            'FCDLPPVLALSCSDTFLSQVVNFLVVVTVGGTSFLQLLISYGYIVSAVLKIPSAEGRWKACNTCAS'
            'HLMVVTLLFGTALFVYLRPSSSYLLGRDKVVSVFYSLVIPMLNPLIYSLRNKEIKDALWKVLERKK'
            'VFS')

    def test_get_genomic_seq_for_region(self):
        """ test that get_genomic_seq_for_region() works correctly
        """

        # note that this test uses GRCh37 coordinates
        seq = self.ensembl.get_genomic_seq_for_region('11', 59210617, 59210637)
        self.assertEqual(seq, 'CTTGTCCTTGTGGTCCACGGG')

    def test_get_chrom_for_transcript(self):
        """ test that get_chrom_for_transcript() works correctly
        """

        chrom = self.ensembl.get_chrom_for_transcript("ENST00000534358", "KMT2A")
        self.assertEqual(chrom, "11")

    def test_get_exon_ranges_for_transcript(self):
        """ test that get_exon_ranges_for_transcript() works correctly
        """

        exons = self.ensembl.get_exon_ranges_for_transcript("ENST00000534358")
        self.assertEqual(exons, [(118307205, 118307659), (118339490, 118339559),
            (118342377, 118345030), (118347520, 118347697), (118348682, 118348916),
            (118350889, 118350953), (118352430, 118352807), (118353137, 118353210),
            (118354898, 118355029), (118355577, 118355690), (118359329, 118359475),
            (118360507, 118360602), (118360844, 118360964), (118361911, 118362033),
            (118362459, 118362643), (118363772, 118363945), (118365003, 118365113),
            (118365409, 118365482), (118366415, 118366608), (118366976, 118367082),
            (118368651, 118368788), (118369085, 118369243), (118370018, 118370135),
            (118370550, 118370628), (118371702, 118371862), (118372387, 118372572),
            (118373113, 118377361), (118378244, 118378324), (118379851, 118379915),
            (118380663, 118380833), (118382666, 118382740), (118390333, 118390507),
            (118390672, 118390779), (118391517, 118391600), (118392003, 118392132),
            (118392612, 118397539)])

    def test_get_cds_ranges_for_transcript(self):
        """ test that get_cds_ranges_for_transcript() works correctly
        """

        cds = self.ensembl.get_cds_ranges_for_transcript("ENST00000534358")
        self.assertEqual(cds, [(118307228, 118307659), (118339490, 118339559),
            (118342377, 118345030), (118347520, 118347697), (118348682, 118348916),
            (118350889, 118350953), (118352430, 118352807), (118353137, 118353210),
            (118354898, 118355029), (118355577, 118355690), (118359329, 118359475),
            (118360507, 118360602), (118360844, 118360964), (118361911, 118362033),
            (118362459, 118362643), (118363772, 118363945), (118365003, 118365113),
            (118365409, 118365482), (118366415, 118366608), (118366976, 118367082),
            (118368651, 118368788), (118369085, 118369243), (118370018, 118370135),
            (118370550, 118370628), (118371702, 118371862), (118372387, 118372572),
            (118373113, 118377361), (118378244, 118378324), (118379851, 118379915),
            (118380663, 118380833), (118382666, 118382740), (118390333, 118390507),
            (118390672, 118390779), (118391517, 118391600), (118392003, 118392132),
            (118392612, 118392887)])

    def test_rate_limit_ensembl_requests(self):
        """ test that rate_limit_ensembl_requests() works correctly
        """

        # pretend that a request was made right now
        current_time = time.time()
        self.ensembl.prior_time = current_time

        self.ensembl.rate_limit_ensembl_requests()
        delta = self.ensembl.prior_time - current_time

        # prior_time must have advanced by at least the rate limit.
        # assertGreaterEqual reports both values if the check fails.
        self.assertGreaterEqual(delta, self.ensembl.rate_limit)
예제 #8
0
def cluster_de_novos(symbol,
                     de_novos,
                     iterations=1000000,
                     ensembl=None,
                     mut_dict=None):
    """ analyse proximity clustering of de novos in a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, which must
            have "missense" and "nonsense" entries
        iterations: number of simulations to run
        ensembl: EnsemblRequest object, for obtaining info from ensembl. One
            using the 'cache' folder and 'grch37' is created if this is None.
        mut_dict: mutation rates, as accepted by SiteRates. These are loaded
            with load_mutation_rates() if this is None.

    Returns:
        the result of combine_p_values() on the per-transcript missense and
        nonsense P values, updated with "miss_dist" and "nons_dist" entries
        holding the per-transcript distances as comma-separated strings.
        Returns None if load_gene() raises an IndexError.
    """

    ensembl = EnsemblRequest('cache', 'grch37') if ensembl is None else ensembl
    mut_dict = load_mutation_rates() if mut_dict is None else mut_dict

    remaining_missense = de_novos["missense"]
    remaining_nonsense = de_novos["nonsense"]

    # load the minimal set of transcripts needed to contain all the de novos.
    # We give up on the gene if load_gene() raises an IndexError.
    try:
        transcripts = load_gene(ensembl, symbol,
                                remaining_missense + remaining_nonsense)
    except IndexError as error:
        print(error)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for transcript in transcripts:
        missense_in_tx = get_de_novos_in_transcript(transcript,
                                                    remaining_missense)
        nonsense_in_tx = get_de_novos_in_transcript(transcript,
                                                    remaining_nonsense)

        rates = SiteRates(transcript, mut_dict)

        miss_dist, miss_prob = get_p_value(transcript, rates, iterations,
                                           "missense", missense_in_tx)
        nons_dist, nons_prob = get_p_value(transcript, rates, iterations,
                                           "lof", nonsense_in_tx)

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # drop the de novos analysed in this transcript, so that subsequent
        # transcripts are analysed with independent events. Note that this
        # might miss some clustering across mutually exclusive transcripts if
        # the de novo events are near the point where the transcripts diverge.
        remaining_missense = [
            x for x in remaining_missense if x not in missense_in_tx]
        remaining_nonsense = [
            x for x in remaining_nonsense if x not in nonsense_in_tx]

    # flatten the per-transcript distances into comma-separated strings
    distances = {
        "miss_dist": ",".join(str(x) for x in miss_dists),
        "nons_dist": ",".join(str(x) for x in nons_dists),
    }

    probs = combine_p_values({"miss_prob": miss_probs,
                              "nons_prob": nons_probs})
    probs.update(distances)

    return probs
예제 #9
0
 def setUpClass(cls):
     """ set up the shared temporary folder, EnsemblRequest and mutation rates
     """
     scratch_dir = tempfile.mkdtemp()
     cls.temp_dir = scratch_dir
     cls.ensembl = EnsemblRequest(scratch_dir, 'grch37')
     cls.mut_dict = load_mutation_rates()