def find_transcripts(ensembl, mut_dict, output, args):
    """ write the de novo counts per transcript for each gene to a file

    Args:
        ensembl: object passed unchanged to the transcript counting function
        mut_dict: not used by this function, kept so existing callers still
            work
        output: writable file-like object, which receives a header line, then
            one tab-separated line per transcript (gene symbol, transcript
            ID, length, de novo count)
        args: options object with a `de_novos` attribute (passed to
            load_de_novos()), and the `all_transcripts` and
            `minimal_transcripts` flags that select how transcripts are
            counted. `all_transcripts` takes precedence if both are set.

    Raises:
        ValueError: if neither args.all_transcripts nor
            args.minimal_transcripts is set.
    """

    # Select the counting function before doing any work. If neither flag was
    # set, the per-gene loop used to reach an unbound `counts` variable and
    # fail with an UnboundLocalError after the header had been written.
    if args.all_transcripts:
        count_transcripts = count_de_novos_per_transcript
    elif args.minimal_transcripts:
        count_transcripts = minimise_transcripts
    else:
        raise ValueError("one of all_transcripts or minimal_transcripts "
            "must be selected")

    de_novos = load_de_novos(args.de_novos)
    output.write("hgnc_symbol\ttranscript_id\tlength\tde_novos\n")

    for symbol in sorted(de_novos):
        print(symbol)
        func_events = de_novos[symbol]["missense"] + de_novos[symbol]["nonsense"]

        # find the counts per transcript, either for all transcripts
        # containing one or more de novos, or for the minimum set of
        # transcripts to contain the de novos. Genes where the counting fails
        # are reported and skipped, rather than halting the run.
        try:
            counts = count_transcripts(ensembl, symbol, func_events)
        except (ValueError, IndexError):
            print("error occured with {0}".format(symbol))
            continue

        # write the transcript details to a file
        for key in counts:
            line = "{}\t{}\t{}\t{}\n".format(symbol, key, counts[key]["len"],
                counts[key]["n"])
            output.write(line)
def find_transcripts(ensembl, mut_dict, output, args):
    """ tabulate de novo counts per transcript for every gene

    Loads the de novos named by args.de_novos, then for each gene symbol (in
    sorted order) counts the missense plus nonsense events per transcript and
    writes one tab-separated line per transcript to the output.

    Args:
        ensembl: object handed on to the transcript counting function
        mut_dict: unused here, retained for backward compatibility with
            callers
        output: writable file-like object for the header and result lines
        args: options object providing `de_novos`, `all_transcripts` and
            `minimal_transcripts`. When both flags are set, `all_transcripts`
            wins.

    Raises:
        ValueError: if neither args.all_transcripts nor
            args.minimal_transcripts is set.
    """

    # Decide the counting mode once, before touching any data or output.
    # Without this check, a call with neither flag set crashed inside the
    # loop with an UnboundLocalError on `counts`, leaving a header-only file.
    if args.all_transcripts:
        counter = count_de_novos_per_transcript
    elif args.minimal_transcripts:
        counter = minimise_transcripts
    else:
        raise ValueError("one of all_transcripts or minimal_transcripts "
            "must be selected")

    de_novos = load_de_novos(args.de_novos)
    output.write("hgnc_symbol\ttranscript_id\tlength\tde_novos\n")

    for symbol in sorted(de_novos):
        print(symbol)
        gene = de_novos[symbol]
        func_events = gene["missense"] + gene["nonsense"]

        # a gene that cannot be counted is reported and skipped, so that the
        # remaining genes are still processed
        try:
            counts = counter(ensembl, symbol, func_events)
        except (ValueError, IndexError):
            print("error occured with {0}".format(symbol))
            continue

        # write the transcript details to a file
        for key in counts:
            line = "{}\t{}\t{}\t{}\n".format(symbol, key, counts[key]["len"],
                counts[key]["n"])
            output.write(line)
def test_minimise_transcripts(self):
    """ check the output of minimise_transcripts() for a single gene
    """

    gene = "DYNLL1"
    positions = [120934226, 120936012]

    # every expected transcript carries the same length and count
    transcript_ids = ['ENST00000242577', 'ENST00000392508',
        'ENST00000392509', 'ENST00000549989', 'ENST00000548342']
    expected = {tx_id: {'len': 89, 'n': 2} for tx_id in transcript_ids}

    observed = minimise_transcripts(self.ensembl, gene, positions)
    self.assertEqual(observed, expected)

    # we expect an empty dict both when there are no de novos at all, and
    # when none of the de novos are in a transcript
    for sites in ([], [100]):
        self.assertEqual(minimise_transcripts(self.ensembl, gene, sites), {})
def main():
    """ write a table of de novo counts per transcript for each gene

    Takes its options from get_options(), loads the de novos from the input
    file, and writes one tab-separated line per transcript (gene symbol,
    transcript ID, length, de novo count) to the output file. Genes for which
    the transcript counting raises a ValueError or IndexError are reported on
    stdout and skipped.

    Raises:
        ValueError: if neither the all_transcripts nor the
            minimal_transcripts option is set.
    """

    input_file, output_file, old_gene_id_file, cache_dir, genome_build, \
        all_transcripts, minimal_transcripts = get_options()

    # Select the counting function before loading data or creating the output
    # file. With neither option set, the per-gene loop previously reached an
    # unbound `counts` variable and failed with an UnboundLocalError.
    # all_transcripts takes precedence if both options are set.
    if all_transcripts:
        count_transcripts = count_de_novos_per_transcript
    elif minimal_transcripts:
        count_transcripts = minimise_transcripts
    else:
        raise ValueError("one of all_transcripts or minimal_transcripts "
            "must be selected")

    # load all the data
    ensembl = EnsemblRequest(cache_dir, genome_build)

    old_gene_ids = {}
    if old_gene_id_file is not None:
        old_gene_ids = get_deprecated_gene_ids(old_gene_id_file)

    known_de_novos = load_de_novos(input_file, exclude_indels=False)

    # the context manager closes the output file even if an unexpected
    # exception escapes the loop
    with open(output_file, "w") as output:
        output.write("hgnc_symbol\ttranscript_id\tlength\tde_novos\n")

        for gene_id in sorted(known_de_novos):
            de_novos = known_de_novos[gene_id]
            func_events = de_novos["missense"] + de_novos["nonsense"]

            # fix HGNC IDs that have been discontinued in favour of other
            # gene IDs. This happens after the de novos are looked up, since
            # they are keyed by the ID found in the input.
            if gene_id in old_gene_ids:
                gene_id = old_gene_ids[gene_id]

            # find the counts per transcript, either for all transcripts
            # containing one or more de novos, or for the minimum set of
            # transcripts to contain the de novos
            try:
                counts = count_transcripts(ensembl, gene_id, func_events)
            except (ValueError, IndexError):
                print("error occured with {0}".format(gene_id))
                continue

            # write the transcript details to a file
            for key in counts:
                line = "{0}\t{1}\t{2}\t{3}\n".format(gene_id, key,
                    counts[key]["len"], counts[key]["n"])
                output.write(line)