def main():
    """Compute a severity P value per gene and write them as a TSV table.

    Reads the command-line options, opens the data sources named there,
    then calls analyse_gene() for each gene symbol found in the de novo
    mutations and writes one 'symbol<TAB>p_value' row per gene to
    args.output.
    """
    args = get_options()

    ensembl = EnsemblRequest(args.cache, args.genome_build)
    cadd = pysam.TabixFile(args.cadd)
    try:
        constraint = load_regional_constraint(args.constraint)

        # open de novo mutations
        all_de_novos = open_mutations(args.de_novos)
        mut_dict = load_mutation_rates()

        # the context manager guarantees the results are flushed and the
        # handle is released, even if analysing a gene raises part-way through
        with open(args.output, 'w') as output:
            output.write('symbol\tseverity_p_value\n')

            for symbol in all_de_novos:
                # skip entries without a usable gene symbol
                if symbol in ['', '.']:
                    continue

                print(symbol)
                de_novos = all_de_novos[symbol]
                p_value = analyse_gene(ensembl, mut_dict, cadd, symbol,
                    de_novos, constraint, WEIGHTS)
                output.write('{}\t{}\n'.format(symbol, p_value))
    finally:
        # release the tabix file handle once all genes are done (or on error)
        cadd.close()
def get_rates_by_constraint(constraint, cache_dir, threshold=1e-4, ratio=1.0):
    ''' get mutation rates in and out of constrained regions

    Args:
        constraint: table of regional constraint, grouped here by its
            'transcript' column; the 'gene' and 'chr' columns are also read.
        cache_dir: cache location handed to EnsemblRequest.
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().

    Returns:
        dict with 'constrained' and 'unconstrained' keys, each a list holding
        one per-transcript rates entry, annotated with 'symbol', 'chrom' and
        'length'.
    '''
    categories = ['constrained', 'unconstrained']
    collected = {'constrained': [], 'unconstrained': []}

    mut_dict = load_mutation_rates()
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    for tx_id, rows in constraint.groupby('transcript'):
        # the transcript ID is used without anything after the first '.'
        tx = construct_gene_object(ensembl, tx_id.split('.')[0])
        site_rates = SiteRates(tx, mut_dict)
        in_constraint = get_constrained_positions(tx, rows, threshold, ratio)

        consequences = [
            'nonsense',
            'missense',
            'synonymous',
            'splice_lof',
            'splice_region',
        ]
        per_gene = get_gene_rates(tx, site_rates, consequences, in_constraint)

        # the annotations are the same for both categories of a transcript
        symbol = list(rows['gene'])[0]
        chrom = list(rows['chr'])[0]
        length = tx.chrom_pos_to_cds(tx.get_cds_end())['pos']

        # now add the gene rates to the larger list of all genes
        for category in categories:
            entry = per_gene[category]
            entry['symbol'] = symbol
            entry['chrom'] = chrom
            entry['length'] = length
            collected[category].append(entry)

    return collected
def annotate_constraint(data, constraint_path, threshold=1e-3, ratio=0.4):
    ''' annotate per-site rates by whether the site is under regional constraint

    Args:
        data: DataFrame of per-site rates with at least 'symbol' and 'pos'
            columns. A boolean 'constrained' column is added to it in place
            (all False) before the per-gene annotation is computed.
        constraint_path: path handed to load_regional_constraint().
        threshold: passed through to get_constrained_positions().
        ratio: passed through to get_constrained_positions().

    Returns:
        DataFrame built by concatenating the per-gene groups, where
        'constrained' is True for rows whose 'pos' is a constrained site of
        that gene. For empty input, a copy of the (empty) input is returned.
    '''
    # default to unconstrained
    data['constrained'] = False

    constraint = load_regional_constraint(constraint_path)
    # NOTE(review): cache_dir is neither a parameter nor a local of this
    # function, so it has to resolve to a module-level name - confirm it exists.
    ensembl = EnsemblRequest(cache_dir, 'grch37')

    # build the lookup of genes with constraint data once, rather than
    # rebuilding the set for every gene in the loop below
    constrained_genes = set(constraint['gene'])

    modified = []
    for symbol, group in data.groupby('symbol'):
        if symbol in constrained_genes:
            regional = constraint[constraint['gene'] == symbol]
            tx_id = list(regional['transcript'])[0]
            tx = construct_gene_object(ensembl, tx_id.split('.')[0])
            sites = get_constrained_positions(tx, regional, threshold, ratio)
        else:
            # genes without constraint data have no constrained sites
            sites = set()

        # work on an explicit copy, so the column assignment never targets a
        # slice of the caller's frame
        group = group.copy()
        group['constrained'] = group['pos'].isin(sites)
        modified.append(group)

    # pandas.concat() raises ValueError when given no frames
    if not modified:
        return data.copy()

    return pandas.concat(modified)
def main():
    """Run the sub-command selected on the command line.

    Builds the Ensembl requester and the mutation rates from the parsed
    options, then calls args.func with them and an open output handle.
    """
    args = get_options()

    ensembl = EnsemblRequest(args.cache_folder, args.genome_build.lower())
    mut_dict = load_mutation_rates(args.rates)

    # close (and flush) the output even when the sub-command raises
    with open(args.out, "wt") as output:
        args.func(ensembl, mut_dict, output, args)
async def runner():
    """Configure logging, then await the sub-command chosen on the command line.

    The sub-command (args.func) receives the RateLimiter instance as its
    ensembl argument, plus the mutation rates, the open output handle and
    the parsed options.
    """
    args = get_options()

    logging.basicConfig(
        filename=args.log,
        format='%(asctime)-15s %(message)s',
        level=logging.INFO,
    )

    async with RateLimiter(per_second=15) as ensembl:
        mut_dict = load_mutation_rates(args.rates)
        with open(args.out, "wt") as output:
            command = args.func
            await command(ensembl, mut_dict, output, args)
def main():
    """Write per-site mutation rates for the known dominant genes.

    Rates are collected gene by gene with get_gene_rates() and written as a
    gzipped, tab-separated table to args.output.
    """
    args = get_options()

    ensembl = EnsemblRequest('cache', 'grch37')
    mut_dict = load_mutation_rates()
    dominant = load_dominant(args.known)

    # empty template that fixes the column order, with 'pos' typed as int
    data = pandas.DataFrame(
        columns=['symbol', 'chrom', 'pos', 'ref', 'alt', 'cq', 'prob'])
    data['pos'] = data['pos'].astype(int)

    # Collect the per-gene tables and join them once at the end.
    # DataFrame.append() re-copied the whole table on every call and was
    # removed in pandas 2.0.
    # NOTE(review): assumes get_gene_rates() returns a DataFrame - confirm.
    per_gene = []
    for symbol in dominant:
        print(symbol)
        per_gene.append(get_gene_rates(symbol, ensembl, mut_dict))

    if per_gene:
        data = pandas.concat([data] + per_gene, ignore_index=True)

    with gzip.open(args.output, 'wt') as handle:
        data.to_csv(handle, sep='\t', index=False)
def main():
    """Write a table of per-transcript mutation rates.

    One row is written per gene/transcript ID: chromosome, length, and the
    log-transformed missense, nonsense, splice_lof, splice_region and
    synonymous rates. IDs that raise ValueError get a row holding the error
    message instead; IDs that raise KeyError are skipped. Once the table is
    complete, include_indel_rates() is run on the output file.
    """
    input_transcripts, input_genes, output_file, rates_file, cache_dir, \
        genome_build = get_options()

    # load all the data
    ensembl = EnsemblRequest(cache_dir, genome_build)
    mut_dict = load_mutation_rates(rates_file)

    if input_transcripts is not None:
        transcripts = load_transcripts(input_transcripts)
    else:
        transcripts = load_genes(input_genes)

    # the context manager closes the file even if an unexpected exception
    # escapes the loop, so partial results are flushed rather than leaked
    with open(output_file, "w") as output:
        output.write("transcript_id\tchrom\tlength\tmissense_rate\tnonsense_rate\t"
            "splice_lof_rate\tsplice_region_rate\tsynonymous_rate\n")

        for gene_id in sorted(transcripts):
            print(gene_id)
            try:
                rates = get_mutation_rates(gene_id, transcripts, mut_dict, ensembl)

                chrom = rates[0]
                length = rates[1]
                rates = rates[2:]

                # log transform the rates, to keep them consistent with the
                # rates from Daly et al.
                line = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(gene_id,
                    chrom, length, *log_transform(rates))
            except ValueError as error:
                line = "{0}\t{1}\n".format(gene_id, error)
            except KeyError:
                # ignore genes with odd genomic sequence eg ENST00000436041 in GRCh37
                continue

            output.write(line)

    # the file must be closed before the indel rates are added to it
    include_indel_rates(output_file)
async def cluster_de_novos(symbol, de_novos, ensembl, iterations=1000000, mut_dict=None):
    """ analyse proximity clustering of de novos in a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, with
            "missense" and "nonsense" keys
        ensembl: object passed to load_gene() for obtaining transcript info
        iterations: number of simulations to run
        mut_dict: dictionary of mutation rates; loaded with
            load_mutation_rates() when None

    Returns:
        a dictionary with "miss_prob" and "nons_prob" (per-transcript P values
        combined by fishers_method()) plus "miss_dist" and "nons_dist"
        (comma-joined per-transcript distances), or None if load_gene()
        raises IndexError.
    """
    if mut_dict is None:
        mut_dict = load_mutation_rates()

    missense = de_novos["missense"]
    nonsense = de_novos["nonsense"]

    # load the set of transcripts that are the minimum set of transcripts
    # required to contain all the de novos, unless we can't find any coding
    # transcripts that contain the de novos.
    try:
        transcripts = await load_gene(ensembl, symbol, missense + nonsense)
    except IndexError as err:
        print(err)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, missense)
        tx_nonsense = get_de_novos_in_transcript(tx, nonsense)

        rates = SiteRates(tx, mut_dict)

        miss_dist, miss_prob = get_p_value(tx, rates, iterations, "missense", tx_missense)
        nons_dist, nons_prob = get_p_value(tx, rates, iterations, "lof", tx_nonsense)

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # remove the de novos analysed in the current transcript, so that
        # analysis of subsequent transcripts uses independent events. NOTE THAT
        # THIS MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS
        # IF THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        missense = [site for site in missense if site not in tx_missense]
        nonsense = [site for site in nonsense if site not in tx_nonsense]

    joined_miss = ",".join(map(str, miss_dists))
    joined_nons = ",".join(map(str, nons_dists))

    return {
        "miss_prob": fishers_method(miss_probs),
        "nons_prob": fishers_method(nons_probs),
        "miss_dist": joined_miss,
        "nons_dist": joined_nons,
    }
def get_rates(self, tx):
    """Return a SiteRates object for the transcript.

    The sequence context mutation rates come from load_mutation_rates()
    called without arguments, and are handed to SiteRates together with tx.
    """
    return SiteRates(tx, load_mutation_rates())
def cluster_de_novos(symbol, de_novos, iterations=1000000, ensembl=None, mut_dict=None):
    """ analyse proximity clustering of de novos in a single gene

    Args:
        symbol: HGNC symbol for a gene
        de_novos: dictionary of de novo positions for the gene, with
            "missense" and "nonsense" keys
        iterations: number of simulations to run
        ensembl: EnsemblRequest object for obtaining transcript info; when
            None, EnsemblRequest('cache', 'grch37') is created
        mut_dict: dictionary of mutation rates; loaded with
            load_mutation_rates() when None

    Returns:
        the result of combine_p_values() on the per-transcript "miss_prob" and
        "nons_prob" lists, updated with "miss_dist" and "nons_dist"
        (comma-joined per-transcript distances), or None if load_gene()
        raises IndexError.
    """
    if ensembl is None:
        ensembl = EnsemblRequest('cache', 'grch37')
    if mut_dict is None:
        mut_dict = load_mutation_rates()

    missense = de_novos["missense"]
    nonsense = de_novos["nonsense"]

    # load the set of transcripts that are the minimum set of transcripts
    # required to contain all the de novos, unless we can't find any coding
    # transcripts that contain the de novos.
    try:
        transcripts = load_gene(ensembl, symbol, missense + nonsense)
    except IndexError as err:
        print(err)
        return None

    miss_dists, nons_dists = [], []
    miss_probs, nons_probs = [], []

    for tx in transcripts:
        tx_missense = get_de_novos_in_transcript(tx, missense)
        tx_nonsense = get_de_novos_in_transcript(tx, nonsense)

        rates = SiteRates(tx, mut_dict)

        miss_dist, miss_prob = get_p_value(tx, rates, iterations, "missense", tx_missense)
        nons_dist, nons_prob = get_p_value(tx, rates, iterations, "lof", tx_nonsense)

        miss_dists.append(miss_dist)
        nons_dists.append(nons_dist)
        miss_probs.append(miss_prob)
        nons_probs.append(nons_prob)

        # remove the de novos analysed in the current transcript, so that
        # analysis of subsequent transcripts uses independent events. NOTE THAT
        # THIS MIGHT MISS SOME CLUSTERING ACROSS MUTUALLY EXCLUSIVE TRANSCRIPTS
        # IF THE DE NOVO EVENTS ARE NEAR THE TRANSCRIPT DIVERGENCE.
        missense = [site for site in missense if site not in tx_missense]
        nonsense = [site for site in nonsense if site not in tx_nonsense]

    distances = {
        "miss_dist": ",".join(map(str, miss_dists)),
        "nons_dist": ",".join(map(str, nons_dists)),
    }

    combined = combine_p_values({"miss_prob": miss_probs, "nons_prob": nons_probs})
    combined.update(distances)

    return combined
def setUpClass(cls):
    """Create the fixtures shared by the tests of this class.

    Sets cls.temp_dir to a fresh temporary directory, cls.ensembl to an
    EnsemblRequest using that directory with the 'grch37' build, and
    cls.mut_dict to the default mutation rates.
    """
    scratch_dir = tempfile.mkdtemp()
    cls.temp_dir = scratch_dir
    cls.ensembl = EnsemblRequest(scratch_dir, 'grch37')
    cls.mut_dict = load_mutation_rates()