class EnsemblAnnotation(object):
    """
    Class for building an annotation file for MAVIS in json format.

    Constructing an instance does all of the work: the pyensembl cache is
    downloaded and indexed, the domain cache is loaded, the "best"
    transcripts are determined and the annotation dict is built. Call
    ``dump_json`` afterwards to write the result to ``output``.

    Args:
        release (int): Ensembl release to use
        species (str): species of interest
        output (str): path to output file
        best_file (str): path to file of "best transcripts"
        alias_file (str): path to file with gene aliases
        custom_cache (str): path to directory to cache pyensembl files
    """

    def __init__(self, release, species, output, best_file=None,
                 alias_file=None, custom_cache=None):
        self.annotation = {}
        self.custom_cache = custom_cache
        self.cache_prefix = None
        self.gen_time = get_date()
        self.release = release
        self.species = species
        self.output = output
        self.best_file = best_file
        self.alias_file = alias_file

        if self.alias_file:
            self.alias = parse_alias_file(self.alias_file)
        else:
            # unknown gene IDs simply yield an empty alias set
            self.alias = defaultdict(set)

        # Export the cache location *before* the release object is built.
        # NOTE(review): pyensembl appears to resolve its cache directory when
        # the genome object is constructed, in which case setting the variable
        # only later (in download_pyensembl_cache) would be ignored - confirm.
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache

        self.data = EnsemblRelease(release, species)
        self.download_pyensembl_cache()
        self.get_domain_cache()

        if self.best_file:
            self.best = parse_best_file(self.best_file)
        else:
            self.best = self.choose_best_transcripts()

        self.build_json()

    def download_pyensembl_cache(self):
        """
        Method download the pyensembl cache files for this release if not
        already there.

        Uses ``self.data`` (the pyensembl release object) and
        ``self.custom_cache`` (optional cache directory). Sets
        ``self.cache_prefix`` to the GTF path with everything from
        "gtf.gz" onwards removed.
        """
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache
        self.data.download()
        self.data.index()
        self.cache_prefix = self.data.gtf_path.split("gtf.gz")[0]

    def get_domain_cache(self):
        """
        Method point the module-level domain cache at a file next to the
        pyensembl cache files and load it via ``parse_cached_domains``.
        """
        global DOMAIN_CACHE_PATH
        DOMAIN_CACHE_PATH = self.cache_prefix + "domain.tsv"
        parse_cached_domains()

    def get_genes(self, eid):
        """
        Method parse gene info in the EnsemblRelease into json format.

        Args:
            eid (str): Ensembl gene ID
        Returns:
            dict: gene info formatted for json
        """
        gene = self.data.gene_by_id(eid)
        # the gene name is always the first alias, followed by any aliases
        # supplied through the alias file
        aliases = [str(gene.gene_name)] + list(self.alias[gene.gene_id])
        return {
            "name": str(gene.gene_id),
            "chr": str(gene.contig),
            "start": int(gene.start),
            "end": int(gene.end),
            "strand": str(gene.strand),
            "aliases": aliases,
            "transcripts": [],
        }

    def get_transcripts(self, eid):
        """
        Method parse transcript info in the EnsemblRelease into json format.
        Ignore non-coding transcripts.

        Args:
            eid (str): Ensembl transcript ID
        Returns:
            dict: transcript info formatted for json, or None when the
                transcript has no protein ID
        """
        transcript = self.data.transcript_by_id(eid)
        protein_id = transcript.protein_id
        if not protein_id:
            return None

        result = {
            "name": str(transcript.transcript_id),
            "start": int(transcript.start),
            "end": int(transcript.end),
            "aliases": [str(transcript.transcript_name)],
            "is_best_transcript": str(transcript.transcript_id) in self.best,
            "protein_id": protein_id,
            "exons": [],
            "domains": [],
        }

        # start/end are absolute genomic positions, so calculate positions
        # relative to the mRNA start
        cpos = transcript.coding_sequence_position_ranges
        if transcript.strand in ("+", "1"):
            coding_start, coding_end = cpos[0][0], cpos[-1][1]
        elif transcript.strand in ("-", "-1"):
            coding_start, coding_end = cpos[0][1], cpos[-1][0]
        else:
            # unrecognised strand: leave the cdna coordinates unset
            return result

        result["cdna_coding_start"] = transcript.spliced_offset(coding_start) + 1
        result["cdna_coding_end"] = transcript.spliced_offset(coding_end) + 1
        return result

    def get_exons(self, eid):
        """
        Method parse exon info in the EnsemblRelease into json format.

        Args:
            eid (str): Ensembl exon ID
        Returns:
            dict: exon info formatted for json
        """
        exon = self.data.exon_by_id(eid)
        return {
            "name": str(exon.exon_id),
            "start": int(exon.start),
            "end": int(exon.end),
        }

    @cached_domains
    def get_domains(self, eid):
        """
        Method request domain info from Ensembl and parse into json format.

        Regions sharing a domain ID are merged into one entry; domains with
        an empty description are skipped.

        Args:
            eid (str): Ensembl protein ID
        Returns:
            list: a list of domains formatted for json
        """
        domains = {}
        for domain in request_ensembl_protein(eid):
            name = str(domain["id"])
            # quotes causing errors when mavis loads json
            desc = str(domain["description"]).replace('"', "").replace("'", "")
            region = {"start": int(domain["start"]), "end": int(domain["end"])}
            if desc == "":
                continue
            if name in domains:
                domains[name]["regions"].append(region)
            else:
                domains[name] = {"name": name, "desc": desc, "regions": [region]}
        return list(domains.values())

    def build_json(self):
        """
        Method compile a json object for MAVIS of all protein coding genes and
        associated info for the indicated species.

        Genes without any coding transcript are left out. Progress and a
        final summary are printed to stdout.

        Returns:
            dict: a json-formatted set of annotations for use with MAVIS
        """
        count = {"gene": 0, "transcript": 0, "non_coding": 0}
        self.annotation["script"] = SCRIPT
        self.annotation["script_version"] = VERSION
        self.annotation["gene_alias_file"] = self.alias_file
        self.annotation["best_transcript_file"] = self.best_file
        self.annotation["ensembl_version"] = self.release
        self.annotation["generation_time"] = self.gen_time
        self.annotation["genes"] = []

        gene_ids = self.data.gene_ids()
        total = len(gene_ids)
        for index, gid in enumerate(gene_ids):
            print("{}/{} genes".format(index, total))
            gened = self.get_genes(gid)
            count["gene"] += 1

            for tid in self.data.transcript_ids_of_gene_id(gid):
                transd = self.get_transcripts(tid)
                count["transcript"] += 1
                if not transd:
                    count["non_coding"] += 1
                    continue

                for eid in self.data.exon_ids_of_transcript_id(tid):
                    transd["exons"].append(self.get_exons(eid))
                transd["domains"] = self.get_domains(transd["protein_id"])
                gened["transcripts"].append(transd)

            # only genes with at least one coding transcript are reported
            if gened["transcripts"]:
                self.annotation["genes"].append(gened)

        print(
            "{gene:,} genes, {transcript:,} transcripts ({non_coding:,} non-coding transcripts, ignored)"
            .format(**count))
        return self.annotation

    def dump_json(self):
        """
        Method dump the annotations in json-format to the specified output
        file.
        """
        with open(self.output, "w") as fh:
            json.dump(self.annotation, fh)

    def delete_cache(self):
        """
        Method delete both the pyensembl and domain cache files, i.e. every
        path matching ``self.cache_prefix + "*"``.
        """
        for cache_file in glob(self.cache_prefix + "*"):
            print("Removing cache file", cache_file)
            os.remove(cache_file)

    def choose_best_transcripts(self):
        """
        Select a canonical transcript for each human gene using Ensembl rules.

        For human, the canonical transcript for a gene is set according to the
        following hierarchy:
        - 1. Longest CCDS translation with no stop codons.
        - 2. If no (1), choose the longest Ensembl/Havana merged translation
             with no stop codons.
        - 3. If no (2), choose the longest translation with no stop codons.
        - 4. If no translation, choose the longest non-protein-coding
             transcript.

        See: http://uswest.ensembl.org/Help/Glossary?id=346

        Returns:
            set: canonical Ensembl transcript IDs, at most one per gene
                (empty for any species other than homo_sapiens)
        """

        def longest_ccds(transcripts):
            """Protein-coding transcript with the longest protein sequence."""
            # NOTE(review): only is_protein_coding is checked here; CCDS
            # membership itself is not tested - confirm this is intended.
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.is_protein_coding:
                    protein_len = len(t.protein_sequence)
                    if protein_len > longest_len:
                        longest = t
                        longest_len = protein_len
            return longest

        def longest_translation(transcripts):
            """Transcript with the longest start-codon to stop span."""
            # NOTE(review): stop - start is taken in genomic coordinates; on
            # the minus strand this may be negative and never selected -
            # verify against pyensembl's codon position ordering.
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.contains_start_codon:
                    start = t.start_codon_positions[0]
                    if t.contains_stop_codon:
                        stop = t.stop_codon_positions[2]
                    else:
                        stop = t.end
                    if stop - start > longest_len:
                        longest = t
                        longest_len = stop - start
            return longest

        def longest_transcript(transcripts):
            """Longest transcript."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.end - t.start > longest_len:
                    longest = t
                    longest_len = t.end - t.start
            return longest

        best = set()

        # Ensembl rules for canonical transcripts only apply to humans
        species = self.data.species.latin_name
        if species != "homo_sapiens":
            print(
                "Unable to choose canonical transcripts for {}. You can specify canonical transcripts with '--best-transcript-file'"
                .format(species))
            return best

        print("Selecting a canoncial transcript for each gene")
        for gene_id in self.data.gene_ids():
            transcripts = [
                self.data.transcript_by_id(transcript_id)
                for transcript_id in self.data.transcript_ids_of_gene_id(gene_id)
            ]
            # fall through the hierarchy until one rule yields a transcript
            selected = (longest_ccds(transcripts)
                        or longest_translation(transcripts)
                        or longest_transcript(transcripts))
            if selected:
                best.add(selected.transcript_id)
        return best
class LlamaEnsembl(object):
    """
    Ensembl tools: local lookups through a pyensembl release (``self.db``)
    plus a few calls to the Ensembl REST service (``self.rest_url``).

    Args:
        genome (str): 'hg19' selects release 75 and the GRCh37 REST server;
            any other value selects release 77 and the default REST server
    """

    def __init__(self, genome='hg19'):
        if genome == 'hg19':
            self.version = 75
            self.rest_url = "http://grch37.rest.ensembl.org"
        else:
            self.version = 77
            self.rest_url = "http://rest.ensembl.org"
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """
        Call the REST endpoint ``ext`` and return the decoded JSON body.

        A POST is issued when ``data`` is truthy, otherwise a GET.

        Raises:
            requests.exceptions.HTTPError: on a 4xx/5xx response
        """
        url = self.rest_url + ext
        if data:
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json"
            }
            r = requests.post(url, headers=headers, data=data)
        else:
            headers = {"Content-Type": "application/json"}
            r = requests.get(url, headers=headers)

        # raises for every response where r.ok is False, so no further
        # bail-out is needed afterwards
        r.raise_for_status()
        return r.json()

    def load_ensembl_ref(self, rid=None):
        """
        Download, load, and index ensembl data.

        Args:
            rid (str): optional transcript ID to look up afterwards
        Returns:
            the transcript for ``rid``, or None when no ``rid`` is given
        """
        # download() takes an overwrite flag, not a release number; passing
        # the version here would force a re-download on every call
        self.db.download()
        self.db.index()
        if rid is not None:
            return self.db.transcript_by_id(rid)
        return None

    def get_exon_numbers(self, gene):
        """
        This creates exon areas from the biggest transcript, i.e. the
        transcript of ``gene`` with the most exons (first one wins on ties).

        Returns:
            pandas.DataFrame: columns start, id, stop, transcript and a
                1-based ``number`` column; empty when the gene has no
                transcripts
        """
        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]

        most_exons = 0
        exon_ids = []  # stays empty if the gene has no transcript with exons
        longest_transcript = None
        for trans in self.db.transcript_ids_of_gene_id(gene_id):
            candidate = self.db.exon_ids_of_transcript_id(trans)
            if len(candidate) > most_exons:
                most_exons = len(candidate)
                exon_ids = candidate
                longest_transcript = trans

        for exid in exon_ids:
            exon = self.db.exon_by_id(exid)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exid)
            dct['transcript'].append(longest_transcript)

        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        """Return the names of all genes overlapping chrom:start-stop."""
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        """Return (contig, start, end) of the first gene ID named ``gene``."""
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        """POST a list of variant IDs to the variation endpoint."""
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """
        Get the genomic location of a variant given as transcript:position.

        Returns:
            tuple: (seq_region_name, start, end) of the first mapping, or
                three empty strings when the REST call fails with an
                HTTP error
        """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            mappings = self.rest_call(ext)['mappings'][0]
        except requests.exceptions.HTTPError:
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """
        Find the exons overlapping chrom:start-stop and number them against
        the transcript chosen by ``get_exon_numbers`` for the gene of the
        first overlapping exon.

        ``gene`` and ``tx_col`` are accepted but not used yet.

        Returns:
            tuple: (transcript ID, comma separated exon numbers), or two
                empty strings when no exon overlaps the interval
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')

        exons = self.db.exons_at_locus(chrom, start, stop)
        if not exons:
            return '', ''

        exon_numbers = self.get_exon_numbers(exons[0].gene_name)
        transcript = exon_numbers['transcript'].values[0]

        trx_exons = []
        for ex in exons:
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)
        return transcript, ','.join([str(number) for number in trx_exons])

    # Annotate DataFrames
    def annotate_dataframe(self,
                           df,
                           chrom_col='CHROM',
                           start_col='START',
                           end_col='END',
                           gene_col=None,
                           tx_col=None):
        """
        Annotate every interval in ``df`` with genes, transcript and exons.

        Args:
            df (pandas.DataFrame): one interval per row
            chrom_col, start_col, end_col (str): interval column names
            gene_col (str): optional column holding the expected gene; when
                that gene overlaps the interval it is the only one reported,
                otherwise a warning is printed
            tx_col (str): optional transcript column, passed through to
                ``parse_ref_exons``
        Returns:
            pandas.DataFrame: columns genes, exons, transcript, indexed
                like ``df``
        """
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            chrom, start, end = row[chrom_col], row[start_col], row[end_col]
            genes_row = self.get_genes(chrom, start, end)
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], chrom, start, end, i))
            genes.append(','.join(genes_row))

            if genes_row and (len(genes_row) == 1 or tx_col):
                # TODO - add functionality to choose gene and transcript
                trans_row, exons_row = self.parse_ref_exons(
                    chrom, start, end, gene=genes_row[0], tx_col=tx_col)
            elif not genes_row:
                # no overlapping gene: there is no genes_row[0] to pass on,
                # even when tx_col is set
                trans_row, exons_row = self.parse_ref_exons(
                    chrom, start, end, tx_col=tx_col)
            else:
                # several candidate genes and nothing to choose between them
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)

        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=None):
        """
        Get chrom:start-end for a list of variants.

        Args:
            rsid_array (list): variant IDs to look up
            extra_cols (list): additional keys of each variant record to
                copy into the result
        Returns:
            pandas.DataFrame: one row per variant found in the response,
                using the first mapping of each
        """
        # None instead of a shared mutable default list
        extra_cols = [] if extra_cols is None else extra_cols
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []

        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            record = response[var]
            mapping = record['mappings'][0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(record['var_class'])
            result['consequence'].append(record['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(record[extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        """
        Add chrom/start/end columns to ``df`` (modified in place) by looking
        up every row's transcript and mutation name via ``get_cds_region``.

        Returns:
            pandas.DataFrame: the same ``df`` object with the new columns
        """
        chroms = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            chrom, start, end = self.get_cds_region(row[tx_col], row[cds_col])
            chroms.append(chrom)
            starts.append(start)
            ends.append(end)
        df['chrom'] = chroms
        df['start'] = starts
        df['end'] = ends
        return df