class LlamaEnsembl(object):
    """Helpers around a local pyensembl release and the Ensembl REST API.

    ``self.db`` is an ``EnsemblRelease`` used for local gene/transcript/exon
    lookups; ``self.rest_url`` is the base URL used by :meth:`rest_call`.
    """

    def __init__(self, genome='hg19'):
        """Select the Ensembl release and REST host for ``genome``.

        ``'hg19'`` selects release 75 and the GRCh37 REST host; any other
        value selects release 77 and the default REST host.
        """
        if genome == 'hg19':
            self.version = 75
            self.rest_url = "http://grch37.rest.ensembl.org"
        else:
            self.version = 77
            self.rest_url = "http://rest.ensembl.org"
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """Call the REST endpoint ``ext`` and return the decoded JSON body.

        A truthy ``data`` is sent as the body of a POST request; otherwise a
        GET request is issued.

        Raises:
            requests.exceptions.HTTPError: if the response status is an error.
        """
        if data:
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json"
            }
            r = requests.post(self.rest_url + ext, headers=headers, data=data)
        else:
            headers = {"Content-Type": "application/json"}
            r = requests.get(self.rest_url + ext, headers=headers)

        # raise_for_status() raises for every error status, so no further
        # handling (such as exiting the interpreter) is needed after it.
        r.raise_for_status()
        return r.json()

    def load_ensembl_ref(self, rid=None):
        """Download and index the Ensembl data, optionally fetching a transcript.

        Returns the transcript with id ``rid`` when ``rid`` is given,
        otherwise ``None``.
        """
        # NOTE(review): self.version is passed positionally to download();
        # confirm against the installed pyensembl that this argument is not
        # interpreted as an overwrite/force flag.
        self.db.download(self.version)
        self.db.index()
        if rid is not None:
            return self.db.transcript_by_id(rid)
        return None

    def get_exon_numbers(self, gene):
        """Number the exons of the transcript of ``gene`` with the most exons.

        The first gene id matching the name ``gene`` is used. Among its
        transcripts the one with the most exon ids is chosen (the first one
        wins ties).

        Returns:
            A DataFrame with columns ``start``, ``id``, ``stop``,
            ``transcript`` and ``number`` (1-based position of the exon in
            the transcript's exon-id list). The frame is empty when the gene
            has no transcripts or no exons.
        """
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]

        best_transcript = None
        best_exon_ids = []
        for transcript_id in self.db.transcript_ids_of_gene_id(gene_id):
            exon_ids = self.db.exon_ids_of_transcript_id(transcript_id)
            if len(exon_ids) > len(best_exon_ids):
                best_transcript = transcript_id
                best_exon_ids = exon_ids

        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        for exid in best_exon_ids:
            exon = self.db.exon_by_id(exid)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exid)
            dct['transcript'].append(best_transcript)

        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        """Return the names of the genes overlapping ``chrom:start-stop``.

        Any ``'chr'`` substring is removed from a string ``chrom`` before the
        lookup.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        """Return ``(contig, start, end)`` of the first gene id named ``gene``."""
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        """POST ``rsids`` to the variation endpoint and return the decoded reply."""
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """Look up the genomic location of ``transcript:position``.

        Returns:
            ``(seq_region_name, start, end)`` of the first mapping, or
            ``('', '', '')`` when the request fails with an HTTP error or the
            response contains no mapping.
        """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            response = self.rest_call(ext)
        except requests.exceptions.HTTPError:
            return '', '', ''

        # A successful response may still carry no mapping; report it the
        # same way as a failed lookup instead of raising.
        try:
            mappings = response['mappings'][0]
        except (KeyError, IndexError):
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """Return the transcript and exon numbers overlapping a locus.

        Exon numbering comes from :meth:`get_exon_numbers` for the gene of
        the first exon found at the locus. ``gene`` and ``tx_col`` are
        accepted for interface compatibility but are not used here.

        Returns:
            ``(transcript_id, exon_numbers)`` where ``exon_numbers`` is a
            comma-separated string, or ``('', '')`` when no exon overlaps the
            locus.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        exons = self.db.exons_at_locus(chrom, start, stop)
        if len(exons) == 0:
            return '', ''

        exon_numbers = self.get_exon_numbers(exons[0].gene_name)
        transcript = exon_numbers['transcript'].values[0]
        trx_exons = []
        for ex in exons:
            # Keep only exons that belong to the selected transcript.
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)
        return transcript, ','.join(str(number) for number in trx_exons)

    # Annotate DataFrames
    def annotate_dataframe(self, df, chrom_col='CHROM', start_col='START',
                           end_col='END', gene_col=None, tx_col=None):
        """Annotate each row of ``df`` with genes, transcript and exon numbers.

        When ``gene_col`` is given and the row's gene overlaps the locus, only
        that gene is kept; otherwise a warning is printed and all overlapping
        genes are kept.

        Returns:
            A new DataFrame indexed like ``df`` with columns ``genes``,
            ``exons`` and ``transcript``.
        """
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            genes_row = self.get_genes(row[chrom_col], row[start_col],
                                       row[end_col])
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], row[chrom_col], row[start_col],
                            row[end_col], i))
            genes.append(','.join(genes_row))

            if not genes_row:
                # No gene at the locus: there is no gene to pass on, even
                # when tx_col is set.
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col], row[start_col], row[end_col],
                    tx_col=tx_col)
            elif len(genes_row) == 1 or tx_col:
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col], row[start_col], row[end_col],
                    gene=genes_row[0], tx_col=tx_col
                )  # TODO - add functionality to choose gene and transcript
            else:
                # Several candidate genes and no transcript column: ambiguous.
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)

        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=None):
        """Build a table of locations and consequences for ``rsid_array``.

        Variants missing from the REST response, or present without any
        mapping, are skipped.

        Args:
            rsid_array: variant ids to look up.
            extra_cols: additional keys of each variant record to copy into
                columns of the same name.

        Returns:
            A DataFrame with columns ``chrom``, ``start``, ``end``, ``rsid``,
            ``allele``, ``vartype``, ``consequence`` plus ``extra_cols``.
        """
        extra_cols = [] if extra_cols is None else extra_cols
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []

        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            record = response[var]
            mappings = record.get('mappings')
            if not mappings:
                continue
            mapping = mappings[0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(record['var_class'])
            result['consequence'].append(record['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(record[extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        """Add ``chrom``, ``start`` and ``end`` columns to ``df`` in place.

        Each row's location comes from :meth:`get_cds_region` called with the
        row's ``tx_col`` and ``cds_col`` values. The same ``df`` object is
        returned.
        """
        chroms = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            location = self.get_cds_region(row[tx_col], row[cds_col])
            chroms.append(location[0])
            starts.append(location[1])
            ends.append(location[2])
        df['chrom'] = chroms
        df['start'] = starts
        df['end'] = ends
        return df
def genes_at_locus(*args, **kwargs):
    """Look up genes at a locus using the module-level Ensembl release.

    A new ``EnsemblRelease(ENSEMBL_RELEASE_VERSION)`` is constructed on every
    call; all positional and keyword arguments are forwarded unchanged to its
    ``genes_at_locus`` method, whose result is returned as-is.
    """
    return EnsemblRelease(ENSEMBL_RELEASE_VERSION).genes_at_locus(
        *args, **kwargs)