def __init__(self, src_file, title='ENSEMBL', version='', tx_mode=TranscriptProvider.TX_MODE_CANONICAL, protocol="file", is_thread_safe=False, tx_filter="dummy", custom_canonical_txs=None):
    """Open the transcript index stores and configure transcript selection.

    :param str src_file: path prefix; the three index files are ``src_file`` plus the suffixes
        ``.transcript.idx``, ``.transcript_by_gene.idx`` and ``.transcript_by_gp_bin.idx``
    :param str title: datasource title (also used in log messages)
    :param str version: datasource version (also used in log messages)
    :param tx_mode: transcript selection mode handed to ``set_tx_mode``
    :param str protocol: shove store protocol used to open the index files
    :param bool is_thread_safe: when True the "memory" cache is used, otherwise the "simple" cache
    :param str tx_filter: name handed to ``TranscriptFilterFactory.create_instance``
    :param custom_canonical_txs: optional list of transcript IDs; a falsy value becomes an empty list
    """
    super(EnsemblTranscriptDatasource, self).__init__(src_file=src_file, title=title, version=version)
    log = logging.getLogger(__name__)

    # Cache settings shared by all three stores.  entry_ttl is the number of seconds before a
    # cache entry should be cleared out.
    entry_ttl = 1000
    entry_limit = 25000

    # By specifying "memory" for the cache, this is thread safe.  Otherwise, use "simple".
    if is_thread_safe:
        cache_kind = "memory"
    else:
        log.info("%s %s is being set up in faster, NOT thread-safe mode (for annotation). " % (title, version))
        cache_kind = "simple"

    def open_index(index_path):
        # Every index is opened with the same store protocol and cache configuration.
        return shove.Shove(protocol + '://%s' % index_path, cache_kind + "://",
                           timeout=entry_ttl, max_entries=entry_limit)

    # Contains a key of transcript id and value of a Transcript class, with sequence data where possible.
    self.transcript_db = open_index(src_file + ".transcript.idx")
    self.gene_db = open_index(src_file + ".transcript_by_gene.idx")
    self.gp_bin_db = open_index(src_file + ".transcript_by_gp_bin.idx")

    # The key listing is requested but never used afterwards.
    # NOTE(review): presumably this forces the bin index to be read at start-up -- confirm.
    _unused_keys = self.gp_bin_db.keys()

    log.info("%s %s is being set up with default tx-mode: %s. " % (title, version, tx_mode))
    self.set_tx_mode(tx_mode)

    log.info("%s %s is being set up with %s filtering. " % (title, version, tx_filter))
    self._tx_filter = TranscriptFilterFactory.create_instance(tx_filter)

    self._hgvs_xformer = HgvsChangeTransformer()

    # Store a list of the custom canonical transcripts
    self._custom_canonical_txs = custom_canonical_txs or []
def __init__(self, src_file, title='ENSEMBL', version='', tx_mode=TranscriptProvider.TX_MODE_CANONICAL, protocol="file", is_thread_safe=False, tx_filter="dummy"):
    """Open the transcript index stores and configure transcript selection.

    :param str src_file: path prefix; the three index files are ``src_file`` plus the suffixes
        ``.transcript.idx``, ``.transcript_by_gene.idx`` and ``.transcript_by_gp_bin.idx``
    :param str title: datasource title (also used in log messages)
    :param str version: datasource version (also used in log messages)
    :param tx_mode: transcript selection mode handed to ``set_tx_mode``
    :param str protocol: shove store protocol used to open the index files
    :param bool is_thread_safe: when True the "memory" cache is used, otherwise the "simple" cache
    :param str tx_filter: name handed to ``TranscriptFilterFactory.create_instance``
    """
    super(EnsemblTranscriptDatasource, self).__init__(src_file=src_file, title=title, version=version)
    log = logging.getLogger(__name__)

    # One (attribute name, file suffix) pair per index store, in the order they are opened.
    index_layout = (
        ("transcript_db", ".transcript.idx"),
        ("gene_db", ".transcript_by_gene.idx"),
        ("gp_bin_db", ".transcript_by_gp_bin.idx"),
    )

    # Seconds before a cache entry should be cleared out, and the cache size limit.
    cache_options = dict(timeout=1000, max_entries=25000)

    # By specifying "memory" for the cache, this is thread safe.  Otherwise, use "simple".
    cache_kind = "memory"
    if not is_thread_safe:
        log.info("%s %s is being set up in faster, NOT thread-safe mode (for annotation). " % (title, version))
        cache_kind = "simple"

    # transcript_db contains a key of transcript id and value of a Transcript class, with sequence
    # data where possible.
    for attr_name, suffix in index_layout:
        index_path = src_file + suffix
        store = shove.Shove(protocol + '://%s' % index_path, cache_kind + "://", **cache_options)
        setattr(self, attr_name, store)

    # The key listing is requested but never used afterwards.
    # NOTE(review): presumably this forces the bin index to be read at start-up -- confirm.
    _unused_keys = self.gp_bin_db.keys()

    log.info("%s %s is being set up with default tx-mode: %s. " % (title, version, tx_mode))
    self.set_tx_mode(tx_mode)

    log.info("%s %s is being set up with %s filtering. " % (title, version, tx_filter))
    self._tx_filter = TranscriptFilterFactory.create_instance(tx_filter)

    self._hgvs_xformer = HgvsChangeTransformer()
class EnsemblTranscriptDatasource(TranscriptProvider, Datasource, SegmentDatasource):
    """Transcript datasource backed by ENSEMBL (or GENCODE) transcript index files.

    Similar to a GAF datasource, but uses ensembl transcripts.  Also, supports gencode.

    Though all transcripts for GENCODE can be loaded, it is currently set to ignore any transcripts that are
    not "basic".

    Any transcripts in the custom canonical transcript list will be selected before the tx-mode parameter.
    """

    # This is the set of annotations that get populated by this datasource.
    POPULATED_ANNOTATION_NAMES = {
        'transcript_exon', 'variant_type', 'variant_classification', 'other_transcripts', 'gene', 'gene_id',
        'annotation_transcript', 'genome_change', 'transcript_id', 'secondary_variant_classification',
        'protein_change', 'codon_change', 'transcript_change', 'transcript_strand', 'gene_type',
        'gencode_transcript_tags', 'gencode_transcript_status', 'havana_transcript', 'ccds_id',
        'gencode_transcript_type', 'transcript_position', 'gencode_transcript_name'
    }

    def __init__(self, src_file, title='ENSEMBL', version='', tx_mode=TranscriptProvider.TX_MODE_CANONICAL, protocol="file", is_thread_safe=False, tx_filter="dummy", custom_canonical_txs=None):
        """Open the transcript index stores and configure transcript selection.

        :param str src_file: path prefix; the index files are ``src_file`` plus ``.transcript.idx``,
            ``.transcript_by_gene.idx`` and ``.transcript_by_gp_bin.idx``
        :param str title: datasource title (used as the source of every annotation)
        :param str version: datasource version
        :param tx_mode: transcript selection mode, see ``set_tx_mode``
        :param str protocol: shove store protocol used to open the index files
        :param bool is_thread_safe: use the "memory" cache (thread safe) instead of the faster "simple" cache
        :param str tx_filter: name handed to ``TranscriptFilterFactory.create_instance``
        :param custom_canonical_txs: optional list of transcript IDs that are preferred over the tx-mode
        """
        super(EnsemblTranscriptDatasource, self).__init__(src_file=src_file, title=title, version=version)
        logger = logging.getLogger(__name__)

        ensembl_index_fname = src_file + ".transcript.idx"
        ensembl_gene_to_transcript_index_fname = src_file + ".transcript_by_gene.idx"
        ensembl_genomic_position_bins_to_transcript_index_fname = src_file + ".transcript_by_gp_bin.idx"

        # Seconds before a cache entry should be cleared out
        timeout = 1000
        max_entries = 25000

        # By specifying "memory" for the cache, this is thread safe.  Otherwise, use "simple"
        cache_protocol = "memory"
        if not is_thread_safe:
            logger.info("%s %s is being set up in faster, NOT thread-safe mode (for annotation). ", title, version)
            cache_protocol = "simple"

        def open_store(index_fname):
            # All three indices share the store protocol and the cache configuration.
            return shove.Shove(protocol + '://%s' % index_fname, cache_protocol + "://",
                               timeout=timeout, max_entries=max_entries)

        # Contains a key of transcript id and value of a Transcript class, with sequence data where possible.
        self.transcript_db = open_store(ensembl_index_fname)
        self.gene_db = open_store(ensembl_gene_to_transcript_index_fname)
        self.gp_bin_db = open_store(ensembl_genomic_position_bins_to_transcript_index_fname)

        # The key listing was assigned to an unused local in the original code; the call is kept.
        # NOTE(review): presumably this forces the bin index to be read at start-up -- confirm.
        self.gp_bin_db.keys()

        logger.info("%s %s is being set up with default tx-mode: %s. ", title, version, tx_mode)
        self.set_tx_mode(tx_mode)

        logger.info("%s %s is being set up with %s filtering. ", title, version, tx_filter)
        self._tx_filter = TranscriptFilterFactory.create_instance(tx_filter)

        self._hgvs_xformer = HgvsChangeTransformer()

        # Store a list of the custom canonical transcripts
        self._custom_canonical_txs = custom_canonical_txs or []

        # IMPORTANT: Any new attributes that can change the results of annotations and, therefore, should
        # invalidate the cache, should be added to the list in get_hashcode.  There should be a way to do this
        # dynamically, but that has not been implemented yet.

    def set_tx_mode(self, tx_mode):
        """Set the transcript selection mode, warning when CANONICAL is requested.

        :param tx_mode: one of the ``TranscriptProvider`` tx-mode constants
        """
        if tx_mode == TranscriptProvider.TX_MODE_CANONICAL:
            logging.getLogger(__name__).warning(
                "Attempting to set transcript mode of CANONICAL for ensembl.  This operation is only supported for GENCODE.  Otherwise, will be the same as EFFECT.")
        self.tx_mode = tx_mode

    def _create_basic_annotation(self, value):
        """Wrap ``value`` in an Annotation attributed to this datasource's title."""
        return Annotation(value=value, datasourceName=self.title)

    def _create_blank_set_of_annotations(self):
        """Return a dict with a blank ('') annotation for every name in POPULATED_ANNOTATION_NAMES."""
        return {name: self._create_basic_annotation('')
                for name in EnsemblTranscriptDatasource.POPULATED_ANNOTATION_NAMES}

    def _retrieve_gencode_tag_value(self, tx, attribute_name):
        """Look up an attribute in the transcript's "other attributes".

        If transcript is not gencode, no error is thrown.  Just a blank value.
        Note that gencode have other attributes, but not plain ol' ENSEMBL

        :param tx: Transcript
        :param attribute_name: key to look up
        :return: "" if other attributes are not present.  "" if specified tag is not present.  Otherwise,
            tag value (as str).
        """
        attribute_dict = tx.get_other_attributes()
        if attribute_dict is None:
            return ""
        return str(attribute_dict.get(attribute_name, ""))

    def get_transcripts_by_pos(self, chr, start, end, padding=3000):
        """Return the filtered list of transcripts that overlap the given genomic position.

        :rtype : list
        :param str chr: contig name
        :param str|int start: start position
        :param str|int end: end position
        :param int padding: number of bases the region is extended by on both sides before the lookup
        """
        txs_unfiltered = self.get_overlapping_transcripts(chr, start, end, padding=padding)
        return self._filter_transcripts(txs_unfiltered)

    def _create_hgvs_dict(self, chosen_tx, mutation):
        """Return the HGVS values for the mutation on ``chosen_tx`` (empty dict without a transformer)."""
        if self._hgvs_xformer is None:
            return dict()
        return self._hgvs_xformer.hgvs_annotate_mutation_given_tx(mutation, chosen_tx)

    def _create_hgvs_annotation_dict(self, mutation, chosen_tx):
        """Return an annotation for every HGVS header, using "" where no value is available."""
        hgvs_dict = self._create_hgvs_dict(chosen_tx, mutation)
        return {header: self._create_basic_annotation(hgvs_dict.get(header, ""))
                for header in HgvsChangeTransformer.HEADERS}

    def annotate_mutation(self, mutation):
        """Add the transcript-based annotations (and the HGVS annotations) to ``mutation``.

        When no transcript overlaps the mutation it is classified as IGR and the nearest genes are
        reported in 'other_transcripts'.

        :param mutation: mutation to annotate; reads chr, start, end, ref_allele and alt_allele
        :return: the same mutation instance, annotated
        """
        chrom = mutation.chr
        start = int(mutation.start)
        end = int(mutation.end)
        txs = self.get_transcripts_by_pos(chrom, start, end)

        final_annotation_dict = self._create_blank_set_of_annotations()
        variant_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
        final_annotation_dict['variant_type'] = Annotation(value=variant_type, datasourceName=self.title)

        # The genome change does not depend on the transcript, so it is computed once for both cases.
        final_annotation_dict['genome_change'] = self._create_basic_annotation(
            TranscriptProviderUtils.determine_genome_change(mutation.chr, mutation.start, mutation.end,
                                                            mutation.ref_allele, mutation.alt_allele,
                                                            variant_type))

        chosen_tx = None
        if not txs:
            # We have hit IGR if no transcripts come back.  Most annotations can just use the blank set.
            self._populate_igr_annotations(final_annotation_dict, chrom, start, end)
        else:
            # Choose the best effect transcript
            chosen_tx = self._choose_transcript(txs, self.get_tx_mode(), variant_type, mutation.ref_allele,
                                                mutation.alt_allele, start, end)
            self._populate_transcript_annotations(final_annotation_dict, mutation, txs, chosen_tx,
                                                  variant_type, start, end)

        mutation.addAnnotations(final_annotation_dict)

        # Add the HGVS annotations ... setting to "" if not available.
        mutation.addAnnotations(self._create_hgvs_annotation_dict(mutation, chosen_tx))
        return mutation

    def _populate_igr_annotations(self, annotations, chrom, start, end):
        """Fill ``annotations`` in place for a mutation that overlaps no transcript (IGR)."""
        annotations['variant_classification'] = self._create_basic_annotation(VariantClassification.IGR)
        (left_gene, left_dist), (right_gene, right_dist) = self._get_nearest_genes(chrom, start, end)
        annotations['other_transcripts'] = self._create_basic_annotation(
            value='%s (%s upstream) : %s (%s downstream)' % (left_gene, left_dist, right_gene, right_dist))
        annotations['gene'] = self._create_basic_annotation('Unknown')
        annotations['gene_id'] = self._create_basic_annotation('0')

    def _populate_transcript_annotations(self, annotations, mutation, txs, chosen_tx, variant_type, start, end):
        """Fill ``annotations`` in place with everything derived from the chosen transcript."""
        make = self._create_basic_annotation
        vcer = VariantClassifier()

        annotations['annotation_transcript'] = make(chosen_tx.get_transcript_id())
        annotations['transcript_position'] = make(
            TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
        annotations['transcript_id'] = make(chosen_tx.get_transcript_id())

        variant_classification = vcer.variant_classify(tx=chosen_tx, variant_type=variant_type,
                                                       ref_allele=mutation.ref_allele,
                                                       alt_allele=mutation.alt_allele,
                                                       start=mutation.start, end=mutation.end)
        # get_exon_i() is incremented by one before being rendered.
        annotations['transcript_exon'] = make(str(variant_classification.get_exon_i() + 1))
        annotations['variant_classification'] = make(variant_classification.get_vc())
        annotations['secondary_variant_classification'] = make(variant_classification.get_secondary_vc())
        annotations['protein_change'] = make(vcer.generate_protein_change_from_vc(variant_classification))
        annotations['codon_change'] = make(
            vcer.generate_codon_change_from_vc(chosen_tx, start, end, variant_classification))
        annotations['transcript_change'] = make(
            vcer.generate_transcript_change_from_tx(chosen_tx, variant_type, variant_classification, start,
                                                    end, mutation.ref_allele, mutation.alt_allele))
        annotations['transcript_strand'] = make(chosen_tx.get_strand())
        annotations['gene'] = make(chosen_tx.get_gene())
        annotations['gene_type'] = make(chosen_tx.get_gene_type())

        # (annotation name, key in the transcript's other attributes)
        gencode_fields = (
            ('gencode_transcript_tags', 'tag'),
            ('gencode_transcript_status', 'transcript_status'),
            ('havana_transcript', 'havana_transcript'),
            ('ccds_id', 'ccdsid'),
            ('gencode_transcript_type', 'transcript_type'),
            ('gencode_transcript_name', 'transcript_name'),
        )
        for annotation_name, attribute_name in gencode_fields:
            annotations[annotation_name] = make(self._retrieve_gencode_tag_value(chosen_tx, attribute_name))

        annotations['other_transcripts'] = make(
            self._render_other_transcripts(txs, [txs.index(chosen_tx)], variant_type, mutation.ref_allele,
                                           mutation.alt_allele, mutation.start, mutation.end))

    def _filter_transcripts(self, txs):
        """Apply the configured transcript filter to ``txs``."""
        return self._tx_filter.filter(txs)

    def _choose_transcript(self, txs, tx_mode, variant_type, ref_allele, alt_allele, start, end):
        """Given a list of transcripts and a transcript mode (e.g. CANONICAL), choose the transcript to use.

        :param list txs: a list of transcripts that presumably overlap the variant
        :param tx_mode: CANONICAL selects the canonical ordering, anything else the best-effect ordering
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param start:
        :param end:
        :return Transcript : chosen transcript given tx-mode (None when ``txs`` is empty)
        """
        if len(txs) == 1:
            return txs[0]
        if tx_mode == TranscriptProvider.TX_MODE_CANONICAL:
            return self._choose_canonical_transcript(txs, variant_type, ref_allele, alt_allele, start, end)
        return self._choose_best_effect_transcript(txs, variant_type, ref_allele, alt_allele, start, end)

    @staticmethod
    def _get_best_scores(txs, scoring_function, comparator):
        """Return every transcript whose score equals the best score.

        :param txs: non-empty iterable of (hashable) transcripts
        :param scoring_function: tx -> score
        :param comparator: picks the best score from an iterable of scores (e.g. ``min`` or ``max``)
        """
        scores = {tx: scoring_function(tx) for tx in txs}
        # values()/items() instead of itervalues()/iteritems(): the same result without depending on
        # the Python 2-only dict iterator methods.
        best = comparator(scores.values())
        return [tx for tx, score in scores.items() if score == best]

    @staticmethod
    def _select_best_with_multiple_criteria(txs, scoring_functions):
        """Narrow ``txs`` down by applying several scoring functions in order.

        :param txs: transcripts to sort
        :param scoring_functions: a sequence of tuples of the form ( tx -> B, [B] -> B).
            An example is (lambda x: x.seq_len(), min), which will apply seq_len to each transcript and then
            keep the transcripts with the minimum resulting length.
        :return list: the remaining transcripts; narrowing stops as soon as a single one is left
        """
        best_txs = txs
        for scoring_function, comparator in scoring_functions:
            if len(best_txs) == 1:
                return best_txs
            best_txs = EnsemblTranscriptDatasource._get_best_scores(best_txs, scoring_function, comparator)
        return best_txs

    @staticmethod
    def _calculate_canonical_score(tx):
        """Score a transcript for canonical selection.  For score, higher is better.

        "Protein coding" is worth four points.  4 - (gencode transcript level) are added.

        For example, a protein coding level 2 transcript will score 6.
        For example, a protein coding level 1 transcript will score 7.

        Level 1 is validated
        Level 2 is manual annotation
        Level 3 is automated annotation.

        A transcript without a 'level' attribute (or without other attributes at all) gets no level points.

        :param tx: Transcript
        :return int: score as described above
        """
        # get_other_attributes() can be None (see _retrieve_gencode_tag_value); treat that as "no level".
        attributes = tx.get_other_attributes() or {}
        lvl = attributes.get('level', [None])[0]
        lvl_score = 0 if lvl is None else 4 - int(lvl)
        type_score = 1 if tx.get_gene_type() == "protein_coding" else 0

        # higher ranks are more important.
        lvl_rank = 0
        type_rank = 2
        return (lvl_score << lvl_rank) + (type_score << type_rank)

    @staticmethod
    def _get_appris_rank(tx):
        """Get the appris ranking from the transcript's tag field.

        The '|'-separated 'tag' attribute is searched for the first tag of
        ``TranscriptProviderUtils.APPRIS_TAGS``.  Candidate tags are mapped to their "_ccds" ranking when
        the transcript also carries the "CCDS" tag.

        :param tx: Transcript
        :return: ranking of the first matching tag, or ``TranscriptProviderUtils.NO_APPRIS_VALUE``
        """
        appris_ranks = TranscriptProviderUtils.APPRIS_RANKING_DICT
        # get_other_attributes() can be None (see _retrieve_gencode_tag_value); treat that as "no tags".
        attributes = tx.get_other_attributes() or {}
        tags = set(attributes.get('tag', "").split("|"))
        for tag in TranscriptProviderUtils.APPRIS_TAGS:
            if tag not in tags:
                continue
            if "CCDS" in tags:
                if tag == "appris_candidate":
                    return appris_ranks["appris_candidate_ccds"]
                if tag in {'appris_candidate_longest_seq', 'appris_candidate_longest'}:
                    return appris_ranks["appris_candidate_longest_ccds"]
            return appris_ranks[tag]
        return TranscriptProviderUtils.NO_APPRIS_VALUE

    @staticmethod
    def _calculate_effect_score(tx, start, end, alt_allele, ref_allele, variant_type):
        """Compute the effect score of the variant on ``tx``.

        The variant classification is looked up in the effect dict of TranscriptProviderUtils; a
        classification that is not in the dict scores 25.  When both alleles are blank ("" or "-") the
        variant is treated as SILENT without classifying it.
        """
        effect_dict = TranscriptProviderUtils.retrieve_effect_dict()
        blank_alleles = ("", "-")
        if ref_allele in blank_alleles and alt_allele in blank_alleles:
            vc = VariantClassification.SILENT
        else:
            vc = VariantClassifier().variant_classify(tx, ref_allele, alt_allele, start, end,
                                                      variant_type).get_vc()
        return effect_dict.get(vc, 25)

    @staticmethod
    def _is_tx_in_tx_list(tx, tx_id_list):
        """Check if the transcript's ID is in the given list of transcript IDs.

        The ID matches when either the full ID or the ID with its version number stripped
        (everything after the last '.') is in the list.  The exact match lets list entries that carry a
        version number match as well.

        :param tx: Transcript
        :param tx_id_list: collection of transcript IDs
        :return bool:
        """
        tx_id = tx.get_transcript_id()
        return tx_id in tx_id_list or tx_id.rsplit('.', 1)[0] in tx_id_list

    def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
        """Choose the transcript with the most detrimental effect.

        The effect rankings are in TranscriptProviderUtils.  The following order of preference is used:
         0. membership in custom canonical transcript list
         1. most detrimental effect
         2. curation level
         3. appris rank
         4. longest sequence (len of get_seq())
         5. lexicographical sort on transcript ID

        :param list txs: list of Transcript
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param str start:
        :param str end:
        :return Transcript: None when ``txs`` is empty
        """
        if len(txs) == 0:
            return None
        best_effect_txs = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(txs, [
            (lambda x: self._is_tx_in_tx_list(x, self._custom_canonical_txs), max),
            (lambda x: self._calculate_effect_score(x, start, end, alt_allele, ref_allele, variant_type), min),
            (self._calculate_canonical_score, max),
            (self._get_appris_rank, min),
            (lambda x: len(x.get_seq()), max),
            (lambda x: x.get_transcript_id(), min),
        ])
        return best_effect_txs[0]

    def _choose_canonical_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
        """Use the level tag to choose canonical transcript.

        Choose highest canonical score.  The following order of preference is used:
         0. membership in custom canonical transcript list
         1. curation level
         2. appris rank
         3. most detrimental effect
         4. longest sequence (len of get_seq())
         5. lexicographical on transcript ID

        :param list txs: list of Transcript
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param str start:
        :param str end:
        :return Transcript: None when ``txs`` is empty
        """
        if len(txs) == 0:
            return None
        highest_scoring_tx = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(txs, [
            (lambda x: self._is_tx_in_tx_list(x, self._custom_canonical_txs), max),
            (self._calculate_canonical_score, max),
            (self._get_appris_rank, min),
            (lambda x: self._calculate_effect_score(x, start, end, alt_allele, ref_allele, variant_type), min),
            (lambda x: len(x.get_seq()), max),
            (lambda x: x.get_transcript_id(), min),
        ])
        return highest_scoring_tx[0]

    def get_overlapping_transcripts(self, chr, start, end, padding=0):
        """Return the (unfiltered) transcripts overlapping the region extended by ``padding`` on both sides."""
        new_start = str(int(start) - padding)
        new_end = str(int(end) + padding)
        records = self._get_binned_transcripts(chr, new_start, new_end)
        return self._get_overlapping_transcript_records(records, new_start, new_end)

    def get_overlapping_genes(self, chr, start, end):
        """Return the set of gene names of the filtered transcripts overlapping the region (no padding)."""
        txs = self._filter_transcripts(self.get_overlapping_transcripts(chr, start, end))
        return {tx.get_gene() for tx in txs}

    def _get_binned_transcripts_given_index(self, chr, start, end, index_dict):
        """Collect the records of every bin of the region from ``index_dict``.

        Index keys have the form "<chr>_<bin>"; bins without an entry are skipped.

        :return set: the records found
        """
        records = set()
        for b in region2bins(int(start), int(end)):
            key = chr + "_" + str(b)
            try:
                records.update(index_dict[key])
            except KeyError:
                # Nothing is indexed for this bin.
                pass
        return records

    def _get_binned_genes(self, chr, start, end):
        """Return the binned records of the region from the gene index."""
        return self._get_binned_transcripts_given_index(chr, start, end, self.gene_db)

    def _get_binned_transcripts(self, chr, start, end):
        """Return the binned records of the region from the genomic position bin index."""
        return self._get_binned_transcripts_given_index(chr, start, end, self.gp_bin_db)

    def _get_overlapping_transcript_records(self, records, start, end):
        """Return the records whose [get_start(), get_end()] span passes the overlap test for start-end."""
        region_start = int(start)
        region_end = int(end)
        return [r for r in records
                if TranscriptProviderUtils.test_overlap(region_start, region_end, r.get_start(), r.get_end())]

    def _get_nearest_genes(self, chr, start, end):
        """Find the nearest gene on either side of the region.

        The search window is widened step by step (1kb, 10kb, 100kb, 1Mb) until a transcript is found.

        :param chr: contig name
        :param int start: region start
        :param int end: region end
        :return tuple: ((left_gene, left_distance), (right_gene, right_distance)), all rendered as str;
            "None" is rendered when no gene was found on that side
        """
        size_extensions = [1000, 10000, 100000, 1000000]

        left_gene, left_dist = None, None
        for s in size_extensions:
            new_start = start - s
            if new_start < 0:
                new_start = 1
            txs = self.get_transcripts_by_pos(chr, new_start, end)
            nearest_gene_border = 0
            nearest_gene = None
            for tx in txs:
                # The border facing the region is the highest genomic position of the transcript.
                if tx.get_strand() == "-":
                    highest_genome_position = tx.determine_transcript_start()
                else:
                    highest_genome_position = tx.determine_transcript_stop()
                if highest_genome_position > nearest_gene_border:
                    nearest_gene_border = highest_genome_position
                    nearest_gene = tx.get_gene()
            if nearest_gene_border:
                left_dist = start - nearest_gene_border
                left_gene = nearest_gene
                break

        no_border_found = int(1e9)
        right_gene, right_dist = None, None
        for s in size_extensions:
            new_end = end + s
            txs = self.get_transcripts_by_pos(chr, start, new_end)
            nearest_gene_border = no_border_found
            nearest_gene = None
            for tx in txs:
                # The border facing the region is the lowest genomic position of the transcript.
                if tx.get_strand() == "-":
                    lowest_genome_position = tx.determine_transcript_stop()
                else:
                    lowest_genome_position = tx.determine_transcript_start()
                if lowest_genome_position < nearest_gene_border:
                    nearest_gene_border = lowest_genome_position
                    nearest_gene = tx.get_gene()
            if nearest_gene_border < no_border_found:
                right_dist = nearest_gene_border - end
                right_gene = nearest_gene
                break

        return ((str(left_gene), str(left_dist)), (str(right_gene), str(right_dist)))

    def _render_other_transcripts(self, txs, transcriptIndicesToSkip, variant_type, ref_allele, alt_allele, start, end):
        """Create a '|'-separated rendering of the transcripts that are not being chosen.

        Other transcripts are formatted <gene>_<transcript_id>_<variant_classification>_<protein_change>

        Note:  There are other areas of Oncotator (e.g. Generic_GeneProteinPositionDatasource) that depend on
        this format.  Changing it here may introduce bugs in other pieces of code.

        Also, do not include any transcript that would render as IGR.

        txs -- a list of transcripts to render.
        transcriptIndicesToSkip -- a list of indices (into txs) of transcripts that are being used (i.e. not an
            "other transcript").  This will usually be the canonical or any transcript chosen by tx_mode.
        """
        vcer = VariantClassifier()
        other_transcripts = list()
        for i, ot in enumerate(txs):
            if i in transcriptIndicesToSkip:
                continue
            vc = vcer.variant_classify(tx=ot, variant_type=variant_type, ref_allele=ref_allele,
                                       alt_allele=alt_allele, start=start, end=end)
            if vc.get_vc() == VariantClassification.IGR:
                continue
            o = '_'.join([ot.get_gene(), ot.get_transcript_id(), vc.get_vc(),
                          vcer.generate_protein_change_from_vc(vc)])
            other_transcripts.append(o.strip('_'))
        return '|'.join(other_transcripts)

    def retrieve_transcripts_by_gene(self, gene):
        """Given a gene, return all of the transcripts (filtered) tagged with the gene.

        :param str gene:
        :rtype : list
        :return: [] when the gene is not in the gene index
        """
        txs_unfiltered = self.gene_db.get(gene, None)
        if txs_unfiltered is None:
            # Unknown gene: do not hand None to the transcript filter.
            return []
        txs = self._filter_transcripts(txs_unfiltered)
        if txs is None:
            return []
        return txs

    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Return a set of (gene, contig, start, end) tuples for each exon of the gene's transcripts.

        :param str gene:
        :param int padding: subtracted from each start and added to each end
        :param bool isCodingOnly: only use "protein_coding" transcripts and their CDS instead of their exons
        :return set: start and end are rendered as str; empty set when the gene is unknown
        """
        result = set()
        txs = self.gene_db.get(gene, None)
        if txs is None:
            return result
        for tx in self._filter_transcripts(txs):
            if isCodingOnly and tx.get_gene_type() != "protein_coding":
                continue
            exons = tx.get_cds() if isCodingOnly else tx.get_exons()
            for exon in exons:
                start = min(exon[0], exon[1])
                end = max(exon[0], exon[1])
                result.add((gene, tx.get_contig(), str(start - padding), str(end + padding)))
        return result

    def getTranscriptDict(self):
        """Return the transcript id -> Transcript store."""
        return self.transcript_db

    def get_transcript(self, tx_id):
        """Return the transcript for ``tx_id``, or None when the id is None or unknown."""
        if tx_id is None:
            return None
        return self.transcript_db.get(tx_id, None)

    def get_tx_mode(self):
        """Return the current transcript selection mode."""
        return self.tx_mode

    def _extract_segment_start_overlap(self, seg):
        """Given a segment, return the gene and exons (e.g. 6+) overlapping the start of the segment.

        :param MutationData seg:
        :return tuple: [0]: start_exon -- start exon index (0-based) followed by whether it is all previous
                exons ("-") or downstream exons ("+") in the coding direction.  "" when no transcript overlaps
                the start or no exon could be determined.
            [1]: start_gene -- gene symbol on the chosen transcript ("" when no transcript overlaps)
        """
        start_txs = self.get_transcripts_by_pos(chr=seg.chr, start=str(seg.start), end=str(seg.start))
        if not start_txs:
            return "", ""
        start_chosen_tx = self._choose_transcript(start_txs, self.get_tx_mode(), VariantClassification.VT_SNP,
                                                  "", "", str(seg.start), str(seg.start))
        result_tuple = self._determine_exons_affected_by_start(int(seg.start), start_chosen_tx)
        start_gene = start_chosen_tx.get_gene()
        # _determine_exons_affected_by_start returns None when no closest exon can be determined.
        start_exon = "" if result_tuple is None else str(result_tuple[0]) + result_tuple[1]
        return start_exon, start_gene

    def _extract_segment_end_overlap(self, seg):
        """Given a segment, return the gene and exons (e.g. 6-) overlapping the end of the segment.

        :param MutationData seg:
        :return tuple: [0]: end_exon -- end exon index (0-based) followed by "+" or "-".  "" when no
                transcript overlaps the end or no exon could be determined.
            [1]: end_gene -- gene symbol on the chosen transcript ("" when no transcript overlaps)
        """
        pos = seg.end
        end_txs = self.get_transcripts_by_pos(chr=seg.chr, start=str(pos), end=str(pos))
        if not end_txs:
            return "", ""
        end_chosen_tx = self._choose_transcript(end_txs, self.get_tx_mode(), VariantClassification.VT_SNP,
                                                "", "", str(pos), str(pos))
        result_tuple = self._determine_exons_affected_by_end(pos, end_chosen_tx)
        end_gene = end_chosen_tx.get_gene()
        # _determine_exons_affected_by_end returns None when no closest exon can be determined.
        end_exon = "" if result_tuple is None else str(result_tuple[0]) + result_tuple[1]
        return end_exon, end_gene

    def annotate_segment(self, seg):
        """Akin to annotate_mutation, but for segments.

        Generates the following annotations:
        genes -- a comma-separated (sorted) list of the genes found in a given region.
        start_gene -- gene symbol overlapped by the segment start position
        end_gene -- gene symbol overlapped by the segment end position
        start_exon -- exon overlap for the start gene.  Includes start exon index (0-based) and whether it is
            all previous exons ("-") or downstream exons ("+") in the coding direction.
            For example:
                6+ sixth exon and on
                6- sixth exon and previous
            Reminder that the exons are 0-based
        end_exon -- same as start_exon, for the end gene

        :returns MutationData seg:  Annotated segment/region
        """
        txs = self.get_transcripts_by_pos(seg.chr, seg.start, seg.end)
        genes_annotation_value = ",".join(sorted({tx.get_gene() for tx in txs}))
        seg.createAnnotation("genes", genes_annotation_value, annotationSource=self.title,
                             annotationDataType="String",
                             annotationDescription="List of genes in the region.")

        # See if we can determine which gene and exon that is overlapped by start
        start_exon, start_gene = self._extract_segment_start_overlap(seg)
        seg.createAnnotation("start_gene", start_gene, annotationSource=self.title,
                             annotationDataType="String",
                             annotationDescription="Gene overlapping start of the region.")
        seg.createAnnotation("start_exon", start_exon, annotationSource=self.title,
                             annotationDataType="String",
                             annotationDescription="Exon index (0-based) that overlaps start with '+' or '-'.  '+' indicates all further exons of the gene are in the region.  '-' indicates all previous exons of the gene are in the region")

        end_exon, end_gene = self._extract_segment_end_overlap(seg)
        seg.createAnnotation("end_gene", end_gene, annotationSource=self.title,
                             annotationDataType="String",
                             annotationDescription="Gene overlapping end of the region.")
        seg.createAnnotation("end_exon", end_exon, annotationSource=self.title,
                             annotationDataType="String",
                             annotationDescription="Exon index (0-based) that overlaps end with '+' or '-'.  '+' indicates all further exons of the gene are in the region.  '-' indicates all previous exons of the gene are in the region")
        return seg

    def _extract_exon_info(self, position, tx):
        """Create basic information about the given position relative to the transcript.

        :param int position: in genomic space
        :param Transcript tx:
        :return tuple: [0]: closest exon index of the position (0-based),
            [1]: True when both the left and the right distance to that exon are positive,
            [2]: whether the position overlaps the exon (left distance <= 0 and right distance >= 0),
            [3]: whether the transcript is on the negative strand.
            When no closest exon can be determined, all four entries are None.
        """
        exon_index = TranscriptProviderUtils.determine_closest_exon(tx, position, position)
        if exon_index is None:
            return exon_index, None, None, None

        left_distance, right_distance = TranscriptProviderUtils.determine_closest_distance_from_exon(
            position, position, exon_index, tx)
        is_in_exon = (left_distance <= 0) and (right_distance >= 0)
        is_diff_is_positive = (left_distance > 0) and (right_distance > 0)
        is_negative_strand = (tx.get_strand() == "-")
        return exon_index, is_diff_is_positive, is_in_exon, is_negative_strand

    def _determine_exons_affected_by_start(self, start, tx):
        """Return the exons affected by start position for the given transcript.

        The direction is "-" for a negative strand transcript and "+" otherwise.  The exon index is derived
        from the closest exon (see _extract_exon_info):

            position                          strand   result
            in the closest exon               "-"      (exon_i, "-")
            in the closest exon               "+"      (exon_i, "+")
            both distances positive           "-"      (exon_i, "-")
            both distances positive           "+"      (exon_i, "+")
            otherwise (outside the exon)      "-"      (exon_i - 1, "-")
            otherwise (outside the exon)      "+"      (exon_i + 1, "+")

        When in an exon, that exon is included as being affected.

        :param int start: start position in genomic space
        :param Transcript tx: transcript affected by start and end
        :return tuple: Tuple of (exon_id, {"+","-"}) where the second is whether all higher number exons ("+")
            or lower number exons ("-").  None when no closest exon can be determined.
            Please note that it is possible to get a result that should be interpreted as "no exon affected",
            such as (-1, "-").  All "no exon affected" will have a [0] of -1.
        """
        exon_index, is_diff_is_positive, is_in_exon, is_negative_strand = self._extract_exon_info(int(start), tx)
        if exon_index is None:
            return None

        direction = "-" if is_negative_strand else "+"
        if is_in_exon or is_diff_is_positive:
            affected_exon = exon_index
        elif is_negative_strand:
            affected_exon = exon_index - 1
        else:
            affected_exon = exon_index + 1

        if (affected_exon < 0 and direction == "-") or \
                (affected_exon >= len(tx.get_exons()) and direction == "+"):
            # This gene is unaffected by the start position
            affected_exon = -1
        return affected_exon, direction

    def _determine_exons_affected_by_end(self, end, tx):
        """Return the exons affected by end position for the given transcript.

        The direction is "+" for a negative strand transcript and "-" otherwise.  The exon index is derived
        from the closest exon (see _extract_exon_info):

            position                          strand   result
            in the closest exon               "-"      (exon_i, "+")
            in the closest exon               "+"      (exon_i, "-")
            both distances positive           "-"      (exon_i + 1, "+")
            both distances positive           "+"      (exon_i - 1, "-")
            otherwise (outside the exon)      "-"      (exon_i, "+")
            otherwise (outside the exon)      "+"      (exon_i, "-")

        :param int end: end position in genomic space
        :param Transcript tx:
        :return tuple: Tuple of (exon_id, {"+","-"}) where the second is whether all higher number exons ("+")
            or lower number exons ("-").  None when no closest exon can be determined.
            Please note that it is possible to get a result that should be interpreted as "no exon affected",
            such as (-1, "-").  All "no exon affected" will have a [0] of -1.
        """
        exon_index, is_diff_is_positive, is_in_exon, is_negative_strand = self._extract_exon_info(int(end), tx)
        if exon_index is None:
            return None

        direction = "+" if is_negative_strand else "-"
        if is_in_exon or not is_diff_is_positive:
            affected_exon = exon_index
        elif is_negative_strand:
            affected_exon = exon_index + 1
        else:
            affected_exon = exon_index - 1

        if (affected_exon < 0 and direction == "-") or \
                (affected_exon >= len(tx.get_exons()) and direction == "+"):
            # This gene is unaffected by the end position
            affected_exon = -1
        return affected_exon, direction

    def get_gene_symbols(self):
        """Return all of the gene symbols recognized by this datasource (the keys of the gene index)."""
        return self.gene_db.keys()

    def set_custom_canonical_txs(self, tx_list):
        """Replace the custom canonical transcript list.

        :param tx_list: list of transcript IDs, with or without version numbers.
        """
        self._custom_canonical_txs = tx_list

    def get_custom_canonical_txs(self):
        """Return the custom canonical transcript list."""
        return self._custom_canonical_txs

    def get_hashcode(self):
        """Return a hashcode that includes the state that influences annotation values.

        Since this class can change annotation values depending on certain state attributes (e.g. tx-mode),
        we need the hashcode to change.  The super class hashcode attribute is treated like an initial
        hashcode here.

        In other words, hashcode is not a simple attribute for this datasource class.

        :return: hashcode including state information
        """
        hasher = Hasher()
        attrs_relevant_for_caching = [self.hashcode, self.get_tx_mode(), str(self._custom_canonical_txs)]
        for attr in attrs_relevant_for_caching:
            hasher.update(attr)
        return Hasher.md5_hash(hasher.hexdigest())
class EnsemblTranscriptDatasource(TranscriptProvider, Datasource, SegmentDatasource):
    """
    Similar to a GAF datasource, but uses ensembl transcripts.  Also, supports gencode.

    Though all transcripts for GENCODE can be loaded, it is currently set to ignore any transcripts that are not
    "basic".
    NOTE(review): in this class the filtering is delegated to the transcript filter created from ``tx_filter``;
    confirm which filter implements the "basic" restriction.

    Three shove-backed indexes are opened from ``src_file``:
        <src_file>.transcript.idx             -- looked up by transcript id (see get_transcript)
        <src_file>.transcript_by_gene.idx     -- looked up by gene symbol (see retrieveExons)
        <src_file>.transcript_by_gp_bin.idx   -- looked up by "<chr>_<bin>" (see _get_binned_transcripts)
    """

    # This is the list of annotations that get populated by this datasource
    POPULATED_ANNOTATION_NAMES = set(['transcript_exon', 'variant_type', 'variant_classification',
                                      'other_transcripts', 'gene', 'gene_id', 'annotation_transcript',
                                      'genome_change', 'strand', 'transcript_id',
                                      'secondary_variant_classification', 'protein_change', 'codon_change',
                                      'transcript_change', 'transcript_strand', 'gene_type',
                                      'gencode_transcript_tags', 'gencode_transcript_status',
                                      'havana_transcript', 'ccds_id', 'gencode_transcript_type',
                                      'transcript_position', 'gencode_transcript_name'])

    def __init__(self, src_file, title='ENSEMBL', version='', tx_mode=TranscriptProvider.TX_MODE_CANONICAL,
                 protocol="file", is_thread_safe=False, tx_filter="dummy"):
        """
        Open the transcript indexes and configure transcript selection and filtering.

        :param str src_file: prefix of the index files (see class docs for the suffixes)
        :param str title: datasource title, also used as the source name of every created annotation
        :param str version: datasource version
        :param tx_mode: transcript selection mode (e.g. TranscriptProvider.TX_MODE_CANONICAL)
        :param str protocol: shove store protocol used to open the index files
        :param bool is_thread_safe: when True use the "memory" shove cache, otherwise the faster "simple" cache
        :param str tx_filter: name handed to TranscriptFilterFactory.create_instance
        """
        super(EnsemblTranscriptDatasource, self).__init__(src_file=src_file, title=title, version=version)
        logger = logging.getLogger(__name__)

        ensembl_index_fname = src_file + ".transcript.idx"
        ensembl_gene_to_transcript_index_fname = src_file + ".transcript_by_gene.idx"
        ensembl_genomic_position_bins_to_transcript_index_fname = src_file + ".transcript_by_gp_bin.idx"

        # Seconds before a cache entry should be cleared out
        timeout = 1000
        max_entries = 25000

        # By specifying "memory" for the cache, this is thread safe.  Otherwise, use "simple"
        cache_protocol = "memory"
        if not is_thread_safe:
            logger.info("%s %s is being set up in faster, NOT thread-safe mode (for annotation). ", title, version)
            cache_protocol = "simple"

        def open_index(index_fname):
            # All three indexes share the same store/cache configuration.
            return shove.Shove('%s://%s' % (protocol, index_fname), cache_protocol + "://",
                               timeout=timeout, max_entries=max_entries)

        # Contains a key of transcript id and value of a Transcript class, with sequence data where possible.
        self.transcript_db = open_index(ensembl_index_fname)
        self.gene_db = open_index(ensembl_gene_to_transcript_index_fname)
        self.gp_bin_db = open_index(ensembl_genomic_position_bins_to_transcript_index_fname)

        # NOTE(review): the result is unused; presumably this call forces the bin index to be read eagerly.
        # Confirm before removing.
        self.gp_bin_db.keys()

        logger.info("%s %s is being set up with default tx-mode: %s. ", title, version, tx_mode)
        self.set_tx_mode(tx_mode)

        logger.info("%s %s is being set up with %s filtering. ", title, version, tx_filter)
        self._tx_filter = TranscriptFilterFactory.create_instance(tx_filter)

        self._hgvs_xformer = HgvsChangeTransformer()

    def set_tx_mode(self, tx_mode):
        """Set the transcript selection mode.  A warning is logged whenever CANONICAL is requested."""
        if tx_mode == TranscriptProvider.TX_MODE_CANONICAL:
            # Logger.warn is a deprecated alias of Logger.warning.
            logging.getLogger(__name__).warning("Attempting to set transcript mode of CANONICAL for ensembl. This operation is only supported for GENCODE. Otherwise, will be the same as EFFECT.")
        self.tx_mode = tx_mode

    def _create_basic_annotation(self, value):
        """Wrap value in an Annotation attributed to this datasource (by title)."""
        return Annotation(value=value, datasourceName=self.title)

    def _create_blank_set_of_annotations(self):
        """Return a dict with an empty-string annotation for every name in POPULATED_ANNOTATION_NAMES."""
        return {name: self._create_basic_annotation('')
                for name in EnsemblTranscriptDatasource.POPULATED_ANNOTATION_NAMES}

    def _retrieve_gencode_tag_value(self, tx, attribute_name):
        """
        If transcript is not gencode, no error is thrown.  Just a blank value.
        Note that gencode have other attributes, but not plain ol' ENSEMBL

        :param tx: transcript
        :param attribute_name: key in the transcript's "other attributes"
        :return: "" if other attributes are not present.  "" if specified tag is not present.  Otherwise, the tag
            value rendered with str().
        """
        attribute_dict = tx.get_other_attributes()
        if attribute_dict is None:
            return ""
        return str(attribute_dict.get(attribute_name, ""))

    def get_transcripts_by_pos(self, chr, start, end):
        """Return the transcripts overlapping [start, end] on chr, after applying the transcript filter."""
        txs_unfiltered = self.get_overlapping_transcripts(chr, start, end)
        return self._filter_transcripts(txs_unfiltered)

    def _create_hgvs_dict(self, chosen_tx, mutation):
        """Return the HGVS values for the mutation on chosen_tx; an empty dict when no transformer is set."""
        if self._hgvs_xformer is None:
            return dict()
        return self._hgvs_xformer.hgvs_annotate_mutation_given_tx(mutation, chosen_tx)

    def _create_hgvs_annotation_dict(self, mutation, chosen_tx):
        """Return an annotation for every key in HgvsChangeTransformer.HEADERS, "" where no value is available."""
        hgvs_dict = self._create_hgvs_dict(chosen_tx, mutation)
        return {k: self._create_basic_annotation(hgvs_dict.get(k, "")) for k in HgvsChangeTransformer.HEADERS}

    def annotate_mutation(self, mutation):
        """
        Populate the transcript-derived annotations (POPULATED_ANNOTATION_NAMES plus the HGVS headers) on mutation.

        When no (filtered) transcript overlaps the mutation, it is classified as IGR, 'gene' is set to 'Unknown',
        'gene_id' to '0' and 'other_transcripts' describes the nearest upstream/downstream genes.  Otherwise, a
        transcript is chosen according to the current tx-mode and the annotations are derived from it.

        :param mutation: mutation providing chr, start, end, ref_allele and alt_allele
        :return: the same mutation instance, with annotations added
        """
        contig = mutation.chr
        start = int(mutation.start)
        end = int(mutation.end)
        annotate = self._create_basic_annotation

        txs = self.get_transcripts_by_pos(contig, start, end)
        final_annotation_dict = self._create_blank_set_of_annotations()

        variant_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
        final_annotation_dict['variant_type'] = annotate(variant_type)

        # Genome change does not depend on the transcript, so it is rendered the same way in both branches.
        final_annotation_dict['genome_change'] = annotate(
            TranscriptProviderUtils.determine_genome_change(mutation.chr, mutation.start, mutation.end,
                                                            mutation.ref_allele, mutation.alt_allele,
                                                            variant_type))
        chosen_tx = None

        # We have hit IGR if no transcripts come back.  Most annotations can just use the blank set.
        if len(txs) == 0:
            final_annotation_dict['variant_classification'] = annotate(VariantClassification.IGR)
            nearest_genes = self._get_nearest_genes(contig, start, end)
            final_annotation_dict['other_transcripts'] = annotate(
                '%s (%s upstream) : %s (%s downstream)' % (nearest_genes[0][0], nearest_genes[0][1],
                                                           nearest_genes[1][0], nearest_genes[1][1]))
            final_annotation_dict['gene'] = annotate('Unknown')
            final_annotation_dict['gene_id'] = annotate('0')
        else:
            # Choose the transcript according to the tx-mode
            chosen_tx = self._choose_transcript(txs, self.get_tx_mode(), variant_type, mutation.ref_allele,
                                                mutation.alt_allele, start, end)
            vcer = VariantClassifier()

            final_annotation_dict['annotation_transcript'] = annotate(chosen_tx.get_transcript_id())
            final_annotation_dict['strand'] = annotate(chosen_tx.get_strand())
            final_annotation_dict['transcript_position'] = annotate(
                TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
            final_annotation_dict['transcript_id'] = annotate(chosen_tx.get_transcript_id())

            variant_classification = vcer.variant_classify(tx=chosen_tx, variant_type=variant_type,
                                                           ref_allele=mutation.ref_allele,
                                                           alt_allele=mutation.alt_allele,
                                                           start=mutation.start, end=mutation.end)
            # The exon index from the classifier is incremented by one before rendering.
            final_annotation_dict['transcript_exon'] = annotate(str(variant_classification.get_exon_i() + 1))
            final_annotation_dict['variant_classification'] = annotate(variant_classification.get_vc())
            final_annotation_dict['secondary_variant_classification'] = annotate(
                variant_classification.get_secondary_vc())
            final_annotation_dict['protein_change'] = annotate(
                vcer.generate_protein_change_from_vc(variant_classification))
            final_annotation_dict['codon_change'] = annotate(
                vcer.generate_codon_change_from_vc(chosen_tx, start, end, variant_classification))
            final_annotation_dict['transcript_change'] = annotate(
                vcer.generate_transcript_change_from_tx(chosen_tx, variant_type, variant_classification, start,
                                                        end, mutation.ref_allele, mutation.alt_allele))
            final_annotation_dict['transcript_strand'] = annotate(chosen_tx.get_strand())
            final_annotation_dict['gene'] = annotate(chosen_tx.get_gene())
            final_annotation_dict['gene_type'] = annotate(chosen_tx.get_gene_type())

            tag_value = self._retrieve_gencode_tag_value
            final_annotation_dict['gencode_transcript_tags'] = annotate(tag_value(chosen_tx, 'tag'))
            final_annotation_dict['gencode_transcript_status'] = annotate(tag_value(chosen_tx, 'transcript_status'))
            final_annotation_dict['havana_transcript'] = annotate(tag_value(chosen_tx, 'havana_transcript'))
            final_annotation_dict['ccds_id'] = annotate(tag_value(chosen_tx, 'ccdsid'))
            final_annotation_dict['gencode_transcript_type'] = annotate(tag_value(chosen_tx, 'transcript_type'))
            final_annotation_dict['gencode_transcript_name'] = annotate(tag_value(chosen_tx, 'transcript_name'))

            other_transcript_value = self._render_other_transcripts(txs, [txs.index(chosen_tx)], variant_type,
                                                                    mutation.ref_allele, mutation.alt_allele,
                                                                    mutation.start, mutation.end)
            final_annotation_dict['other_transcripts'] = annotate(other_transcript_value)
            # 'gene_id' is left blank in this branch.

        mutation.addAnnotations(final_annotation_dict)

        # Add the HGVS annotations ... setting to "" if not available.
        hgvs_dict_annotations = self._create_hgvs_annotation_dict(mutation, chosen_tx)
        mutation.addAnnotations(hgvs_dict_annotations)

        return mutation

    def _filter_transcripts(self, txs):
        """Apply the configured transcript filter to txs."""
        return self._tx_filter.filter(txs)

    def _choose_transcript(self, txs, tx_mode, variant_type, ref_allele, alt_allele, start, end):
        """Given a list of transcripts and a transcript mode (e.g. CANONICAL), choose the transcript to use.

        :param list txs: a list of transcripts that presumably overlap the variant
        :param tx_mode: transcript mode; anything other than CANONICAL selects by best effect
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param start:
        :param end:
        :return Transcript: chosen transcript given tx-mode
        """
        if len(txs) == 1:
            return txs[0]
        if tx_mode == TranscriptProvider.TX_MODE_CANONICAL:
            return self._choose_canonical_transcript(txs, variant_type, ref_allele, alt_allele, start, end)
        return self._choose_best_effect_transcript(txs, variant_type, ref_allele, alt_allele, start, end)

    def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
        """Choose the transcript with the most detrimental effect.
        The rankings are retrieved from TranscriptProviderUtils (lower score is more detrimental).
        Ties are broken in favor of the transcript whose get_seq() is longer.

        :param txs: iterable of Transcript
        :param str variant_type:
        :param str ref_allele:
        :param str alt_allele:
        :param str start:
        :param str end:
        :return Transcript: None when txs is empty
        """
        vcer = VariantClassifier()
        effect_dict = TranscriptProviderUtils.retrieve_effect_dict()

        # When both alleles are blank, every transcript is scored as SILENT (no classification is run).
        is_blank_alleles = ref_allele in ("", "-") and alt_allele in ("", "-")

        best_effect_score = 100000000  # lower score is more likely to get picked
        best_effect_tx = None
        for tx in txs:
            if is_blank_alleles:
                vc = VariantClassification.SILENT
            else:
                vc = vcer.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type).get_vc()
            # Classifications missing from the ranking get a score of 25.
            effect_score = effect_dict.get(vc, 25)
            if effect_score < best_effect_score:
                best_effect_score = effect_score
                best_effect_tx = tx
            elif effect_score == best_effect_score and len(best_effect_tx.get_seq()) < len(tx.get_seq()):
                best_effect_tx = tx
        return best_effect_tx

    def _choose_canonical_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
        """Use the canonical score (see _calculate_canonical_score) to choose the canonical transcript.

        Choose highest canonical score.  Ties are broken by whichever transcript has the most detrimental effect.

        :return Transcript: None when txs is empty
        """
        if len(txs) == 0:
            return None
        txs_by_score = dict()
        for tx in txs:
            txs_by_score.setdefault(self._calculate_canonical_score(tx), set()).add(tx)

        highest_scoring_txs = txs_by_score[max(txs_by_score.keys())]
        if len(highest_scoring_txs) == 1:
            return next(iter(highest_scoring_txs))
        return self._choose_best_effect_transcript(highest_scoring_txs, variant_type, ref_allele, alt_allele,
                                                   start, end)

    def _calculate_canonical_score(self, tx):
        """
        Level 1 is validated
        Level 2 is manual annotation
        Level 3 is automated annotation.

        The score is (4 - level), or 0 when there is no level, plus 4 when the gene type is "protein_coding".

        :param tx: Transcript
        :return int: canonical score; higher is more important
        """
        # Plain ENSEMBL transcripts may have no "other attributes" at all (see _retrieve_gencode_tag_value);
        #  treat that the same as a missing level instead of failing.
        attribute_dict = tx.get_other_attributes()
        if attribute_dict is None:
            lvl = None
        else:
            lvl = attribute_dict.get('level', [None])[0]

        # higher ranks are more important.
        lvl_rank = 0
        lvl_score = 0 if lvl is None else 4 - int(lvl)

        type_rank = 2
        type_score = 1 if tx.get_gene_type() == "protein_coding" else 0

        return (lvl_score << lvl_rank) + (type_score << type_rank)

    def get_overlapping_transcripts(self, chr, start, end, padding=0):
        """Return the (unfiltered) transcripts overlapping [start - padding, end + padding] on chr."""
        new_start = str(int(start) - padding)
        new_end = str(int(end) + padding)
        records = self._get_binned_transcripts(chr, new_start, new_end)
        return self._get_overlapping_transcript_records(records, new_start, new_end)

    def get_overlapping_genes(self, chr, start, end):
        """Return the set of genes of the filtered transcripts overlapping [start, end] on chr."""
        txs = self._filter_transcripts(self.get_overlapping_transcripts(chr, start, end))
        return set([tx.get_gene() for tx in txs])

    def _get_binned_transcripts_given_index(self, chr, start, end, index_dict):
        """Return the set of records stored in index_dict under "<chr>_<bin>" for every bin of [start, end]."""
        records = list()
        for b in region2bins(int(start), int(end)):
            key = chr + "_" + str(b)
            try:
                records.extend(index_dict[key])
            except KeyError:
                # Nothing is indexed in this bin.
                pass
        return set(records)

    def _get_binned_genes(self, chr, start, end):
        # NOTE(review): gene_db is looked up by gene symbol elsewhere in this class (see retrieveExons), while
        #  this lookup uses "<chr>_<bin>" keys.  Presumably this never finds anything -- confirm.
        return self._get_binned_transcripts_given_index(chr, start, end, self.gene_db)

    def _get_binned_transcripts(self, chr, start, end):
        """Return the set of transcripts indexed in any genomic position bin of [start, end] on chr."""
        return self._get_binned_transcripts_given_index(chr, start, end, self.gp_bin_db)

    def _get_overlapping_transcript_records(self, records, start, end):
        """Keep only the records whose [get_start(), get_end()] overlaps [start, end]."""
        start, end = int(start), int(end)
        return [r for r in records if TranscriptProviderUtils.test_overlap(start, end, r.get_start(), r.get_end())]

    def _get_nearest_genes(self, chr, start, end):
        """
        Find the nearest gene on each side of [start, end] by searching windows of increasing size
        (1kb, 10kb, 100kb, 1Mb).  The search on a side stops at the first window size that yields a gene.

        :param chr: contig
        :param int start:
        :param int end:
        :return tuple: ((left_gene, left_dist), (right_gene, right_dist)), all rendered with str().  A side on
            which nothing was found is rendered as ('None', 'None').
        """
        size_extensions = [1000, 10000, 100000, 1000000]

        left_gene, left_dist = None, None
        for s in size_extensions:
            new_start = start - s
            if new_start < 0:
                new_start = 1
            txs = self.get_transcripts_by_pos(chr, new_start, end)
            nearest_gene_border = 0
            for tx in txs:
                if tx.get_strand() == "-":
                    highest_genome_position = tx.determine_transcript_start()
                else:
                    highest_genome_position = tx.determine_transcript_stop()
                if highest_genome_position > nearest_gene_border:
                    nearest_gene_border = highest_genome_position
                    nearest_gene = tx.get_gene()
            if nearest_gene_border:
                left_dist = start - nearest_gene_border
                left_gene = nearest_gene
                break

        right_gene, right_dist = None, None
        no_border = int(1e9)  # sentinel larger than any position compared below
        for s in size_extensions:
            new_end = end + s
            txs = self.get_transcripts_by_pos(chr, start, new_end)
            nearest_gene_border = no_border
            for tx in txs:
                if tx.get_strand() == "-":
                    lowest_genome_position = tx.determine_transcript_stop()
                else:
                    lowest_genome_position = tx.determine_transcript_start()
                if lowest_genome_position < nearest_gene_border:
                    nearest_gene_border = lowest_genome_position
                    nearest_gene = tx.get_gene()
            if nearest_gene_border < no_border:
                right_dist = nearest_gene_border - end
                right_gene = nearest_gene
                break

        return ((str(left_gene), str(left_dist)), (str(right_gene), str(right_dist)))

    def _render_other_transcripts(self, txs, transcriptIndicesToSkip, variant_type, ref_allele, alt_allele, start, end):
        """
        Create a list of transcripts that are not being chosen.

        Other transcripts are formatted <gene>_<transcript_id>_<variant_classification>_<protein_change>
            Note:  There are other areas of Oncotator (e.g. Generic_GeneProteinPositionDatasource) that depend
                on this format.  Changing it here may introduce bugs in other pieces of code.

            Also, do not include any transcript that would render as IGR.

        txs -- a list of transcripts to render.
        transcriptIndicesToSkip -- a list of indices (into txs) of transcripts that are being used (i.e. not an
            "other transcript").  This will usually be the canonical or any transcript chosen by tx_mode.

        :return str: the rendered transcripts joined with '|'
        """
        vcer = VariantClassifier()
        other_transcripts = list()
        for i, ot in enumerate(txs):
            if i in transcriptIndicesToSkip:
                continue
            vc = vcer.variant_classify(tx=ot, variant_type=variant_type, ref_allele=ref_allele,
                                       alt_allele=alt_allele, start=start, end=end)
            if vc.get_vc() == VariantClassification.IGR:
                continue
            o = '_'.join([ot.get_gene(), ot.get_transcript_id(), vc.get_vc(),
                          vcer.generate_protein_change_from_vc(vc)])
            # A blank protein change (or gene) leaves a dangling separator.
            other_transcripts.append(o.strip('_'))

        return '|'.join(other_transcripts)

    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Return a set of (gene, contig, start, end) tuples, one for each exon of each filtered transcript of gene.

        start and end are strings, with padding subtracted from start and added to end.

        :param str gene: gene symbol
        :param int padding: number of positions to extend each exon on both sides
        :param bool isCodingOnly: when True, skip transcripts whose gene type is not "protein_coding" and use the
            CDS instead of the exons
        :return set: empty set when the gene is not in the gene index
        """
        result = set()
        txs = self.gene_db.get(gene, None)
        if txs is None:
            return result

        for tx in self._filter_transcripts(txs):
            if isCodingOnly:
                if tx.get_gene_type() != "protein_coding":
                    continue
                exons = tx.get_cds()
            else:
                exons = tx.get_exons()

            for exon in exons:
                start = min(exon[0], exon[1])
                end = max(exon[0], exon[1])
                result.add((gene, tx.get_contig(), str(start - padding), str(end + padding)))

        return result

    def getTranscriptDict(self):
        """Return the transcript index (looked up by transcript id)."""
        return self.transcript_db

    def get_transcript(self, tx_id):
        """Return the transcript for tx_id; None when tx_id is None or is not in the index."""
        if tx_id is None:
            return None
        return self.transcript_db.get(tx_id, None)

    def get_tx_mode(self):
        """Return the current transcript selection mode."""
        return self.tx_mode

    def _extract_segment_boundary_overlap(self, seg, position, determine_exons_affected):
        """Shared implementation of the segment start/end overlap: (exon rendering, gene) at one position.

        :param MutationData seg: segment (only seg.chr is read here)
        :param position: genomic position of the segment boundary
        :param determine_exons_affected: _determine_exons_affected_by_start or _determine_exons_affected_by_end
        :return tuple: ("", "") when no transcript overlaps the position.  The exon rendering is "" when the
            affected exons cannot be determined for the chosen transcript.
        """
        pos_str = str(position)
        txs = self.get_transcripts_by_pos(chr=seg.chr, start=pos_str, end=pos_str)
        if txs is None or len(txs) == 0:
            return "", ""

        chosen_tx = self._choose_transcript(txs, self.get_tx_mode(), VariantClassification.VT_SNP, "", "",
                                            pos_str, pos_str)
        result_tuple = determine_exons_affected(position, chosen_tx)

        # The determination returns None when no closest exon could be found; render that as blank rather than
        #  failing on the subscript.
        if result_tuple is None:
            exon = ""
        else:
            exon = str(result_tuple[0]) + result_tuple[1]
        return exon, chosen_tx.get_gene()

    def _extract_segment_start_overlap(self, seg):
        """
        Given a segment, return the gene and exons (e.g. 6+) overlapping the start of the segment.

        :param MutationData seg:
        :return tuple:
            [0]: start_exon start exon index (0-based) and whether it is all previous exons ("-") or downstream
                exons ("+") in the coding direction.
            [1]: start_gene -- gene symbol on the chosen transcript
        """
        return self._extract_segment_boundary_overlap(seg, seg.start, self._determine_exons_affected_by_start)

    def _extract_segment_end_overlap(self, seg):
        """
        Given a segment, return the gene and exons overlapping the end of the segment.

        :param MutationData seg:
        :return tuple: (end_exon, end_gene), in the same format as _extract_segment_start_overlap
        """
        return self._extract_segment_boundary_overlap(seg, seg.end, self._determine_exons_affected_by_end)

    def annotate_segment(self, seg):
        """
        Akin to annotate_mutation, but for segments.

        Generates the following annotations:
        genes -- a comma-separated list of the genes found in a given region.
        start_gene -- gene symbol overlapped by the segment start position
        end_gene -- gene symbol overlapped by the segment end position
        start_exon -- exon overlap for the start gene.  Includes start exon index (0-based) and whether it is all
            previous exons ("-") or downstream exons ("+") in the coding direction.
            For example:
                6+  sixth exon and on
                6-  sixth exon and previous
                Reminder that the exons are 0-based
        end_exon -- exon overlap for the end gene, in the same format as start_exon.

        :returns MutationData seg: Annotated segment/region
        """
        txs = self.get_transcripts_by_pos(seg.chr, seg.start, seg.end)
        genes = set([tx.get_gene() for tx in txs])
        genes_annotation_value = ",".join(sorted(genes))
        seg.createAnnotation("genes", genes_annotation_value, annotationSource=self.title,
                             annotationDataType="String", annotationDescription="List of genes in the region.")

        # See if we can determine which gene and exon that is overlapped by start
        start_exon, start_gene = self._extract_segment_start_overlap(seg)
        seg.createAnnotation("start_gene", start_gene, annotationSource=self.title, annotationDataType="String",
                             annotationDescription="Gene overlapping start of the region.")
        seg.createAnnotation("start_exon", start_exon, annotationSource=self.title, annotationDataType="String",
                             annotationDescription="Exon index (0-based) that overlaps start with '+' or '-'. '+' indicates all further exons of the gene are in the region. '-' indicates all previous exons of the gene are in the region")

        end_exon, end_gene = self._extract_segment_end_overlap(seg)
        seg.createAnnotation("end_gene", end_gene, annotationSource=self.title, annotationDataType="String",
                             annotationDescription="Gene overlapping end of the region.")
        seg.createAnnotation("end_exon", end_exon, annotationSource=self.title, annotationDataType="String",
                             annotationDescription="Exon index (0-based) that overlaps end with '+' or '-'. '+' indicates all further exons of the gene are in the region. '-' indicates all previous exons of the gene are in the region")

        return seg

    def _extract_exon_info(self, position, tx):
        """
        Create basic information about the given position relative to the transcript.

        :param int position: in genomic space
        :param Transcript tx:
        :return tuple:
            [0]: closest exon index of the position (0-based),
            [1]: whether both distances from the closest exon are positive (false for overlap)
            [2]: whether the position overlaps an exon
            [3]: whether the transcript is on the negative strand
            When no closest exon can be determined, the result is (None, None, None, None)
        """
        exon_index = TranscriptProviderUtils.determine_closest_exon(tx, position, position)
        if exon_index is None:
            return exon_index, None, None, None

        left_distance, right_distance = TranscriptProviderUtils.determine_closest_distance_from_exon(
            position, position, exon_index, tx)
        is_in_exon = (left_distance <= 0) and (right_distance >= 0)
        is_diff_is_positive = (left_distance > 0) and (right_distance > 0)
        is_negative_strand = (tx.get_strand() == "-")
        return exon_index, is_diff_is_positive, is_in_exon, is_negative_strand

    @staticmethod
    def _finalize_exons_affected(exon_index, direction, tx):
        """Return (exon_index, direction), with the index replaced by -1 when no exon of tx is affected."""
        is_before_first = exon_index < 0 and direction == "-"
        if is_before_first or (exon_index >= len(tx.get_exons()) and direction == "+"):
            # This gene is unaffected by the position
            exon_index = -1
        return exon_index, direction

    def _determine_exons_affected_by_start(self, start, tx):
        """
        Return the exons affected by start position for the given transcript.  The exon returned is always
        affected.

        Description
        (position is ___ of
         the nearest exon)    l_diff vs r_diff                     both are    seg     tx strand    result
        "to the right"        ldiff < rdiff                        negative    start   "-"          <  exon_i
        "to the left"         ldiff < rdiff                        positive    start   "-"          <= exon_i
        "to the right"        ldiff < rdiff                        negative    end     "-"          >= exon_i
        "to the left"         ldiff < rdiff                        positive    end     "-"          >  exon_i
        "to the right"        ldiff < rdiff                        negative    start   "+"          >  exon_i
        "to the left"         ldiff < rdiff                        positive    start   "+"          >= exon_i
        "to the right"        ldiff < rdiff                        negative    end     "+"          <= exon_i
        "to the left"         ldiff < rdiff                        positive    end     "+"          <  exon_i

        # When in an exon, include that exon as being affected
        "in an exon"          ldiff is negative, rdiff positive    both        start   "-"          <= exon_i
        "in an exon"          ldiff is negative, rdiff positive    both        end     "-"          >= exon_i
        "in an exon"          ldiff is negative, rdiff positive    both        start   "+"          >= exon_i
        "in an exon"          ldiff is negative, rdiff positive    both        end     "+"          <= exon_i

        :param int start: start position in genomic space
        :param Transcript tx: transcript affected by start and end
        :return tuple: Tuple of (exon_id, {"+","-"}) where the second is whether all higher number exons ("+") or
            lower number exons ("-")
            Please note that it is possible to get a result that should be interpreted as "no exon affected",
            such as (-1, "-")
            All "no exon affected" will have a [0] of -1.
            None is returned when no closest exon can be determined.
        """
        exon_index, is_diff_is_positive, is_in_exon, is_negative_strand = self._extract_exon_info(int(start), tx)
        if exon_index is None:
            return None

        # From the start position, the affected exons run toward lower numbers on "-" and higher on "+".
        direction = "-" if is_negative_strand else "+"
        if is_in_exon or is_diff_is_positive:
            affected_index = exon_index
        elif is_negative_strand:
            affected_index = exon_index - 1
        else:
            affected_index = exon_index + 1

        return self._finalize_exons_affected(affected_index, direction, tx)

    def _determine_exons_affected_by_end(self, end, tx):
        """
        Table of calculation is in the docs for _determine_exons_affected_by_start

        :param int end: end position in genomic space
        :param Transcript tx:
        :return tuple: Tuple of (exon_id, {"+","-"}) where the second is whether all higher number exons ("+") or
            lower number exons ("-")
            Please note that it is possible to get a result that should be interpreted as "no exon affected",
            such as (-1, "-")
            All "no exon affected" will have a [0] of -1.
            None is returned when no closest exon can be determined.
        """
        exon_index, is_diff_is_positive, is_in_exon, is_negative_strand = self._extract_exon_info(int(end), tx)
        if exon_index is None:
            return None

        # From the end position, the affected exons run toward higher numbers on "-" and lower on "+".
        direction = "+" if is_negative_strand else "-"
        if is_in_exon or not is_diff_is_positive:
            affected_index = exon_index
        elif is_negative_strand:
            affected_index = exon_index + 1
        else:
            affected_index = exon_index - 1

        return self._finalize_exons_affected(affected_index, direction, tx)
class EnsemblTranscriptDatasource(TranscriptProvider, Datasource): """ Similar to a GAF datasource, but uses ensembl transcripts. Also, supports gencode Though all transcripts for GENCODE can be loaded, it is currently set to ignore any transcripts that are not "basic" """ """This is the list of annotations that get populated by this datasource""" POPULATED_ANNOTATION_NAMES = set(['transcript_exon', 'variant_type', 'variant_classification', 'other_transcripts', 'gene', 'gene_id', 'annotation_transcript', 'genome_change', 'strand', 'transcript_id', 'secondary_variant_classification', 'protein_change', 'codon_change', 'transcript_change', 'transcript_strand', 'gene', 'gene_type', 'gencode_transcript_tags', 'gencode_transcript_status', 'havana_transcript', 'ccds_id', 'gencode_transcript_type', 'transcript_position', 'gencode_transcript_name']) def __init__(self, src_file, title='ENSEMBL', version='', tx_mode=TranscriptProvider.TX_MODE_CANONICAL, protocol="file", is_thread_safe=False, tx_filter="dummy"): super(EnsemblTranscriptDatasource, self).__init__(src_file=src_file, title=title, version=version) ensembl_index_fname = src_file + ".transcript.idx" ensembl_gene_to_transcript_index_fname = src_file + ".transcript_by_gene.idx" ensembl_genomic_position_bins_to_transcript_index_fname = src_file + ".transcript_by_gp_bin.idx" # Seconds before a cache entry should be cleared out timeout = 1000 max_entries = 25000 cache_protocol = "memory" if not is_thread_safe: logging.getLogger(__name__).info("%s %s is being set up in faster, NOT thread-safe mode (for annotation). " % (title, version)) cache_protocol = "simple" # Contains a key of transcript id and value of a Transcript class, with sequence data where possible. # By specifying "memory" for the cache, this is thread safe. 
Otherwise, use "simple" self.transcript_db = shove.Shove(protocol + '://%s' % ensembl_index_fname, cache_protocol + "://", timeout=timeout, max_entries=max_entries) self.gene_db = shove.Shove(protocol + '://%s' % ensembl_gene_to_transcript_index_fname, cache_protocol + "://", timeout=timeout, max_entries=max_entries) self.gp_bin_db = shove.Shove(protocol + '://%s' % ensembl_genomic_position_bins_to_transcript_index_fname, cache_protocol + "://", timeout=timeout, max_entries=max_entries) tmp = self.gp_bin_db.keys() logging.getLogger(__name__).info("%s %s is being set up with default tx-mode: %s. " % (title, version, tx_mode)) self.set_tx_mode(tx_mode) logging.getLogger(__name__).info("%s %s is being set up with %s filtering. " % (title, version, tx_filter)) self._tx_filter = TranscriptFilterFactory.create_instance(tx_filter) self._hgvs_xformer = HgvsChangeTransformer() def set_tx_mode(self, tx_mode): if tx_mode == TranscriptProvider.TX_MODE_CANONICAL: logging.getLogger(__name__).warn("Attempting to set transcript mode of CANONICAL for ensembl. This operation is only supported for GENCODE. Otherwise, will be the same as EFFECT.") self.tx_mode = tx_mode def _create_basic_annotation(self, value): return Annotation(value=value, datasourceName=self.title) def _create_blank_set_of_annotations(self): final_annotation_dict = dict() for k in EnsemblTranscriptDatasource.POPULATED_ANNOTATION_NAMES: final_annotation_dict[k] = self._create_basic_annotation('') return final_annotation_dict def _retrieve_gencode_tag_value(self, tx, attribute_name): """ If transcript is not gencode, no error is thrown. Just a blank value. Note that gencode have other attributes, but not plain ol' ENSEMBL :param tx: :param attribute_name: :return: "" if other attributes are not present. "" if specified tag is not present. Otherwise, tag value. 
""" attribute_dict = tx.get_other_attributes() if attribute_dict is None: return "" return str(attribute_dict.get(attribute_name, "")) def get_transcripts_by_pos(self, chr, start, end): txs_unfiltered = self.get_overlapping_transcripts(chr, start, end) txs = self._filter_transcripts(txs_unfiltered) return txs def _create_hgvs_dict(self, chosen_tx, mutation): hgvs_dict = dict() if self._hgvs_xformer is not None: hgvs_dict = self._hgvs_xformer.hgvs_annotate_mutation_given_tx(mutation, chosen_tx) return hgvs_dict def _create_hgvs_annotation_dict(self, mutation, chosen_tx): hgvs_dict_keys = HgvsChangeTransformer.HEADERS hgvs_dict = self._create_hgvs_dict(chosen_tx, mutation) hgvs_dict_annotations = dict() for k in hgvs_dict_keys: hgvs_dict_annotations[k] = self._create_basic_annotation(hgvs_dict.get(k, "")) return hgvs_dict_annotations def annotate_mutation(self, mutation): chr = mutation.chr start = int(mutation.start) end = int(mutation.end) txs = self.get_transcripts_by_pos(chr, start, end) final_annotation_dict = self._create_blank_set_of_annotations() final_annotation_dict['variant_type'] = Annotation(value=TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele), datasourceName=self.title) chosen_tx = None # We have hit IGR if no transcripts come back. Most annotations can just use the blank set. 
if len(txs) == 0: final_annotation_dict['variant_classification'] = self._create_basic_annotation(VariantClassification.IGR) nearest_genes = self._get_nearest_genes(chr, int(start), int(end)) final_annotation_dict['other_transcripts'] = self._create_basic_annotation(value='%s (%s upstream) : %s (%s downstream)' % (nearest_genes[0][0], nearest_genes[0][1], nearest_genes[1][0], nearest_genes[1][1])) final_annotation_dict['gene'] = self._create_basic_annotation('Unknown') final_annotation_dict['gene_id'] = self._create_basic_annotation('0') final_annotation_dict['genome_change'] = self._create_basic_annotation(TranscriptProviderUtils.determine_genome_change(mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, final_annotation_dict['variant_type'].value)) else: # Choose the best effect transcript chosen_tx = self._choose_transcript(txs, self.get_tx_mode(), final_annotation_dict['variant_type'].value, mutation.ref_allele, mutation.alt_allele, start, end) vcer = VariantClassifier() final_annotation_dict['annotation_transcript'] = self._create_basic_annotation(chosen_tx.get_transcript_id()) final_annotation_dict['genome_change'] = self._create_basic_annotation(TranscriptProviderUtils.determine_genome_change(mutation.chr, mutation.start, mutation.end, mutation.ref_allele, mutation.alt_allele, final_annotation_dict['variant_type'].value)) final_annotation_dict['strand'] = self._create_basic_annotation(chosen_tx.get_strand()) final_annotation_dict['transcript_position'] = self._create_basic_annotation(TranscriptProviderUtils.render_transcript_position(int(start), int(end), chosen_tx)) final_annotation_dict['transcript_id'] = self._create_basic_annotation(chosen_tx.get_transcript_id()) variant_classfication = vcer.variant_classify(tx=chosen_tx, variant_type=final_annotation_dict['variant_type'].value, ref_allele=mutation.ref_allele, alt_allele=mutation.alt_allele, start=mutation.start, end=mutation.end) final_annotation_dict['transcript_exon'] 
= self._create_basic_annotation(str(variant_classfication.get_exon_i()+1)) final_annotation_dict['variant_classification'] = self._create_basic_annotation(variant_classfication.get_vc()) final_annotation_dict['secondary_variant_classification'] = self._create_basic_annotation(variant_classfication.get_secondary_vc()) final_annotation_dict['protein_change'] = self._create_basic_annotation(vcer.generate_protein_change_from_vc(variant_classfication)) final_annotation_dict['codon_change'] = self._create_basic_annotation(vcer.generate_codon_change_from_vc(chosen_tx, start, end, variant_classfication)) final_annotation_dict['transcript_change'] = self._create_basic_annotation(vcer.generate_transcript_change_from_tx(chosen_tx, final_annotation_dict['variant_type'].value, variant_classfication, start, end, mutation.ref_allele, mutation.alt_allele)) final_annotation_dict['transcript_strand'] = self._create_basic_annotation(chosen_tx.get_strand()) final_annotation_dict['gene'] = self._create_basic_annotation(chosen_tx.get_gene()) final_annotation_dict['gene_type'] = self._create_basic_annotation(chosen_tx.get_gene_type()) final_annotation_dict['gencode_transcript_tags'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'tag')) final_annotation_dict['gencode_transcript_status'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_status')) final_annotation_dict['havana_transcript'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'havana_transcript')) final_annotation_dict['ccds_id'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'ccdsid')) final_annotation_dict['gencode_transcript_type'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_type')) final_annotation_dict['gencode_transcript_name'] = self._create_basic_annotation(self._retrieve_gencode_tag_value(chosen_tx, 'transcript_name')) other_transcript_value = 
self._render_other_transcripts(txs, [txs.index(chosen_tx)], final_annotation_dict['variant_type'].value, mutation.ref_allele, mutation.alt_allele, mutation.start, mutation.end) final_annotation_dict['other_transcripts'] = self._create_basic_annotation(other_transcript_value) # final_annotation_dict['gene_id'].value mutation.addAnnotations(final_annotation_dict) # Add the HGVS annotations ... setting to "" if not available. hgvs_dict_annotations = self._create_hgvs_annotation_dict(mutation, chosen_tx) mutation.addAnnotations(hgvs_dict_annotations) return mutation def _filter_transcripts(self, txs): return self._tx_filter.filter(txs) def _choose_transcript(self, txs, tx_mode, variant_type, ref_allele, alt_allele, start, end): """Given a list of transcripts and a transcript mode (e.g. CANONICAL), choose the transcript to use. """ if len(txs) == 1: return txs[0] if tx_mode == TranscriptProvider.TX_MODE_CANONICAL: return self._choose_canonical_transcript(txs, variant_type, ref_allele, alt_allele, start, end) return self._choose_best_effect_transcript(txs, variant_type, ref_allele, alt_allele, start, end) def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end): """Choose the transcript with the most detrimental effect. The rankings are in TranscriptProviderUtils. Ties are broken by which transcript has the longer coding length. 
""" vcer = VariantClassifier() effect_dict = TranscriptProviderUtils.retrieve_effect_dict() best_effect_score = 100000000 # lower score is more likely to get picked best_effect_tx = None for tx in txs: vc = vcer.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type).get_vc() effect_score = effect_dict.get(vc, 25) if effect_score < best_effect_score: best_effect_score = effect_score best_effect_tx = tx elif (effect_score == best_effect_score) and (len(best_effect_tx.get_seq()) < len(tx.get_seq())): best_effect_score = effect_score best_effect_tx = tx return best_effect_tx def _choose_canonical_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end): """Use the level tag to choose canonical transcript. Choose highest canonical score. Ties are broken by whichever transcript has the most detrimental effect. """ if len(txs) == 0: return None scores = dict() for tx in txs: score = self._calculate_canonical_score(tx) if score not in scores.keys(): scores[score] = set() scores[score].add(tx) highest_score = max(scores.keys()) highest_scoring_txs = scores[highest_score] if len(highest_scoring_txs) == 1: return list(highest_scoring_txs)[0] else: return self._choose_best_effect_transcript(highest_scoring_txs, variant_type, ref_allele, alt_allele, start, end) def _calculate_canonical_score(self, tx): """ Level 1 is validated Level 2 is manual annotation Level 3 is automated annotation. :param tx: Transcript :return: """ # higher ranks are more important. 
lvl_rank = 0 lvl = tx.get_other_attributes().get('level', [None])[0] if lvl is None: lvl_score = 0 else: lvl_score = 4 - int(lvl) type_rank = 2 type_score = 0 if tx.get_gene_type() == "protein_coding": type_score = 1 return (lvl_score << lvl_rank) + (type_score << type_rank) def get_overlapping_transcripts(self, chr, start, end, padding=0): new_start = str(int(start) - padding) new_end = str(int(end) + padding) records = self._get_binned_transcripts(chr, new_start, new_end) return self._get_overlapping_transcript_records(records, new_start, new_end) def get_overlapping_genes(self, chr, start, end): txs = self.get_overlapping_transcripts(chr, start, end) txs = self._filter_transcripts(txs) return set([tx.get_gene() for tx in txs]) def _get_binned_transcripts_given_index(self, chr, start, end, index_dict): bins = region2bins(int(start), int(end)) records = list() for b in bins: key = chr + "_" + str(b) try: txs = index_dict[key] records.extend(txs) except KeyError: pass return set(records) def _get_binned_genes(self, chr, start, end): return self._get_binned_transcripts_given_index(chr, start, end, self.gene_db) def _get_binned_transcripts(self, chr, start, end): return self._get_binned_transcripts_given_index(chr, start, end, self.gp_bin_db) def _get_overlapping_transcript_records(self, records, start, end): return [r for r in records if TranscriptProviderUtils.test_overlap(int(start), int(end), r.get_start(), r.get_end())] def _get_nearest_genes(self, chr, start, end): size_extensions = [1000, 10000, 100000, 1000000] left_gene, left_dist = None, None for s in size_extensions: new_start = start - s if new_start < 0: new_start = 1 txs = self.get_transcripts_by_pos(chr, new_start, end) nearest_gene_border = 0 for tx in txs: if tx.get_strand() == "-": highest_genome_position = tx.determine_transcript_start() else: highest_genome_position = tx.determine_transcript_stop() if highest_genome_position > nearest_gene_border: nearest_gene_border = highest_genome_position 
nearest_gene = tx.get_gene() if nearest_gene_border: left_dist = start - nearest_gene_border left_gene = nearest_gene break right_gene, right_dist = None, None for s in size_extensions: new_end = end + s txs = self.get_transcripts_by_pos(chr, start, new_end) nearest_gene_border = int(1e9) for tx in txs: if tx.get_strand() == "-": lowest_genome_position = tx.determine_transcript_stop() else: lowest_genome_position = tx.determine_transcript_start() if lowest_genome_position < nearest_gene_border: nearest_gene_border = lowest_genome_position nearest_gene = tx.get_gene() if nearest_gene_border < int(1e9): right_dist = nearest_gene_border - end right_gene = nearest_gene break return ((str(left_gene), str(left_dist)), (str(right_gene), str(right_dist))) def _render_other_transcripts(self, txs, transcriptIndicesToSkip, variant_type, ref_allele, alt_allele, start, end): """ Create a list of transcripts that are not being chosen. Other transcripts are formatted <gene>_<transcript_id>_<variant_classification>_<protein_change> Note: There are other areas of Oncotator (e.g. Generic_GeneProteinPositionDatasource) that depend on this format. Changing it here may introduce bugs in other pieces of code. Also, do not include any transcript that would render as IGR. txs -- a list of transcripts to render. transcriptIndicesToSkip -- a list of transcripts that are being used (i.e. not an "other transcript"). This will usually be the canonical or any transcript chosen by tx_mode. 
""" vcer = VariantClassifier() other_transcripts = list() for i, ot in enumerate(txs): if i not in transcriptIndicesToSkip: vc = vcer.variant_classify(tx=ot, variant_type=variant_type, ref_allele=ref_allele, alt_allele=alt_allele, start=start, end=end) if vc.get_vc() == VariantClassification.IGR: continue o = '_'.join([ot.get_gene(), ot.get_transcript_id(), vc.get_vc(), vcer.generate_protein_change_from_vc(vc)]) o = o.strip('_') other_transcripts.append(o) return '|'.join(other_transcripts) def retrieveExons(self, gene, padding=10, isCodingOnly=False): """Return a list of (chr, start, end) tuples for each exon""" result = set() txs = self.gene_db.get(gene, None) if txs is None: return result txs = self._filter_transcripts(txs) for tx in txs: # If tx is coding if isCodingOnly and tx.get_gene_type() != "protein_coding": continue if isCodingOnly: exons = tx.get_cds() else: exons = tx.get_exons() for exon in exons: start = min(exon[0], exon[1]) end = max(exon[0], exon[1]) result.add((gene, tx.get_contig(), str(start - padding), str(end + padding))) return result def getTranscriptDict(self): return self.transcript_db def get_transcript(self, tx_id): if tx_id is None: return None return self.transcript_db.get(tx_id, None) def get_tx_mode(self): return self.tx_mode