def annotate_mutation(self, mutation):
    """Add transcript-based annotations to ``mutation`` and return it.

    Transcripts overlapping the mutation's position are looked up via
    ``get_transcripts_by_pos``.  When none come back the mutation is treated
    as intergenic (IGR): only the classification, nearest genes, placeholder
    gene fields and genome change are filled in, and every other annotation
    keeps its value from the blank annotation set.  Otherwise one transcript
    is selected with ``_choose_transcript`` (using the current tx mode) and
    the transcript-specific annotations are derived from it.

    HGVS annotations are added last, after the main annotations have been
    attached to the mutation.

    :param mutation: object exposing ``chr``, ``start``, ``end``,
        ``ref_allele``, ``alt_allele`` and ``addAnnotations``.
    :return: the same ``mutation`` object, with annotations added.
    """
    contig = mutation.chr
    start = int(mutation.start)
    end = int(mutation.end)
    txs = self.get_transcripts_by_pos(contig, start, end)

    ref_allele = mutation.ref_allele
    alt_allele = mutation.alt_allele
    annotate = self._create_basic_annotation

    annotations = self._create_blank_set_of_annotations()
    annotations['variant_type'] = Annotation(
        value=TranscriptProviderUtils.infer_variant_type(ref_allele, alt_allele),
        datasourceName=self.title)
    variant_type = annotations['variant_type'].value

    # Stays None on the IGR path; the HGVS step below receives it either way.
    chosen_tx = None

    if len(txs) == 0:
        # No overlapping transcript: intergenic region.  Most annotations can
        # simply keep their blank-set values.
        annotations['variant_classification'] = annotate(VariantClassification.IGR)
        nearest_genes = self._get_nearest_genes(contig, start, end)
        upstream = nearest_genes[0]
        downstream = nearest_genes[1]
        annotations['other_transcripts'] = annotate(
            value='%s (%s upstream) : %s (%s downstream)' % (
                upstream[0], upstream[1], downstream[0], downstream[1]))
        annotations['gene'] = annotate('Unknown')
        annotations['gene_id'] = annotate('0')
        # Genome change is rendered from the mutation's own (unconverted)
        # start/end values.
        annotations['genome_change'] = annotate(
            TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end,
                ref_allele, alt_allele, variant_type))
    else:
        # Pick the transcript to annotate against, according to the tx mode.
        chosen_tx = self._choose_transcript(
            txs, self.get_tx_mode(), variant_type,
            ref_allele, alt_allele, start, end)
        classifier = VariantClassifier()

        annotations['annotation_transcript'] = annotate(chosen_tx.get_transcript_id())
        annotations['genome_change'] = annotate(
            TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end,
                ref_allele, alt_allele, variant_type))
        annotations['strand'] = annotate(chosen_tx.get_strand())
        annotations['transcript_position'] = annotate(
            TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
        annotations['transcript_id'] = annotate(chosen_tx.get_transcript_id())

        # Classification is requested with the mutation's own start/end
        # values (not the int-converted locals), as are the other transcripts.
        classification = classifier.variant_classify(
            tx=chosen_tx, variant_type=variant_type,
            ref_allele=ref_allele, alt_allele=alt_allele,
            start=mutation.start, end=mutation.end)

        # The exon index from the classifier is shifted by one before rendering.
        annotations['transcript_exon'] = annotate(str(classification.get_exon_i() + 1))
        annotations['variant_classification'] = annotate(classification.get_vc())
        annotations['secondary_variant_classification'] = annotate(
            classification.get_secondary_vc())
        annotations['protein_change'] = annotate(
            classifier.generate_protein_change_from_vc(classification))
        annotations['codon_change'] = annotate(
            classifier.generate_codon_change_from_vc(chosen_tx, start, end, classification))
        annotations['transcript_change'] = annotate(
            classifier.generate_transcript_change_from_tx(
                chosen_tx, variant_type, classification,
                start, end, ref_allele, alt_allele))
        annotations['transcript_strand'] = annotate(chosen_tx.get_strand())
        annotations['gene'] = annotate(chosen_tx.get_gene())
        annotations['gene_type'] = annotate(chosen_tx.get_gene_type())

        # (annotation key, GENCODE tag name) pairs, in the order they are set.
        gencode_fields = (
            ('gencode_transcript_tags', 'tag'),
            ('gencode_transcript_status', 'transcript_status'),
            ('havana_transcript', 'havana_transcript'),
            ('ccds_id', 'ccdsid'),
            ('gencode_transcript_type', 'transcript_type'),
            ('gencode_transcript_name', 'transcript_name'),
        )
        for annotation_key, gencode_tag in gencode_fields:
            annotations[annotation_key] = annotate(
                self._retrieve_gencode_tag_value(chosen_tx, gencode_tag))

        # Every transcript except the chosen one is summarised separately.
        other_transcript_value = self._render_other_transcripts(
            txs, [txs.index(chosen_tx)], variant_type,
            ref_allele, alt_allele, mutation.start, mutation.end)
        annotations['other_transcripts'] = annotate(other_transcript_value)
        # Note: 'gene_id' is not assigned on this path.

    mutation.addAnnotations(annotations)

    # Add the HGVS annotations ... setting to "" if not available.
    hgvs_annotations = self._create_hgvs_annotation_dict(mutation, chosen_tx)
    mutation.addAnnotations(hgvs_annotations)

    return mutation
def _render_other_transcripts(self, txs, transcriptIndicesToSkip, variant_type, ref_allele, alt_allele, start, end):
    """Render the transcripts that were not chosen as a single string.

    Each rendered transcript has the form
        <gene>_<transcript_id>_<variant_classification>_<protein_change>
    with leading/trailing underscores stripped, and the entries are joined
    with '|'.

    Note:  There are other areas of Oncotator (e.g. Generic_GeneProteinPositionDatasource)
        that depend on this format.  Changing it here may introduce bugs in other pieces of code.

    Transcripts that classify as IGR are left out.

    :param txs: list of transcripts to render.
    :param transcriptIndicesToSkip: indices into ``txs`` of transcripts that
        are being used (i.e. not an "other transcript").  This will usually
        be the canonical or any transcript chosen by tx_mode.
    :param variant_type: passed through to the variant classifier.
    :param ref_allele: passed through to the variant classifier.
    :param alt_allele: passed through to the variant classifier.
    :param start: passed through to the variant classifier.
    :param end: passed through to the variant classifier.
    :return: '|'-joined string of rendered transcripts ('' when there are none).
    """
    classifier = VariantClassifier()
    rendered = []

    for index, transcript in enumerate(txs):
        # The chosen transcript(s) are reported elsewhere.
        if index in transcriptIndicesToSkip:
            continue

        classification = classifier.variant_classify(
            tx=transcript, variant_type=variant_type,
            ref_allele=ref_allele, alt_allele=alt_allele,
            start=start, end=end)
        vc_name = classification.get_vc()
        if vc_name == VariantClassification.IGR:
            continue

        fields = [
            transcript.get_gene(),
            transcript.get_transcript_id(),
            vc_name,
            classifier.generate_protein_change_from_vc(classification),
        ]
        # An empty protein change would otherwise leave a trailing '_'.
        rendered.append('_'.join(fields).strip('_'))

    return '|'.join(rendered)
def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
    """Pick the transcript whose variant classification ranks as most detrimental.

    Rankings come from ``TranscriptProviderUtils.retrieve_effect_dict()``; a
    lower score wins, and a classification missing from that dict scores 25.
    When two transcripts score the same, the one whose ``get_seq()`` result
    is strictly longer is kept; otherwise the earlier transcript stays.

    If both alleles are empty ("" or "-"), every transcript is scored as
    SILENT and the classifier is not consulted.

    :param list txs: list of Transcript
    :param str variant_type:
    :param str ref_allele:
    :param str alt_allele:
    :param str start:
    :param str end:
    :return Transcript: the selected transcript, or None when ``txs`` is empty.
    """
    classifier = VariantClassifier()
    effect_ranks = TranscriptProviderUtils.retrieve_effect_dict()

    # This check does not depend on the transcript, so evaluate it once.
    ref_is_blank = ref_allele == "" or ref_allele == "-"
    alt_is_blank = alt_allele == "" or alt_allele == "-"
    both_alleles_blank = ref_is_blank and alt_is_blank

    winner = None
    winning_score = 100000000  # lower score is more likely to get picked

    for candidate in txs:
        if both_alleles_blank:
            vc = VariantClassification.SILENT
        else:
            vc = classifier.variant_classify(
                candidate, ref_allele, alt_allele, start, end, variant_type).get_vc()
        score = effect_ranks.get(vc, 25)

        improves = score < winning_score
        # Sequence lengths are only compared on an exact score tie.
        wins_tie = (
            not improves
            and score == winning_score
            and len(winner.get_seq()) < len(candidate.get_seq())
        )
        if improves or wins_tie:
            winner = candidate
            winning_score = score

    return winner
# NOTE(review): there is no ``self`` parameter here -- presumably this is a
# static method or module-level helper; confirm against the enclosing scope.
def _calculate_effect_score(tx, start, end, alt_allele, ref_allele, variant_type):
    """Compute the effect score of a variant against a single transcript.

    The score is looked up in ``TranscriptProviderUtils.retrieve_effect_dict()``
    by variant classification; a classification missing from that dict
    scores 25.  When both alleles are empty ("" or "-") the variant is scored
    as SILENT without calling the classifier.

    Note that ``alt_allele`` comes before ``ref_allele`` in this signature.

    :param tx: transcript to classify against.
    :param start: passed through to the variant classifier.
    :param end: passed through to the variant classifier.
    :param alt_allele: alternate allele.
    :param ref_allele: reference allele.
    :param variant_type: passed through to the variant classifier.
    :return: the effect score for the resulting classification.
    """
    effect_ranks = TranscriptProviderUtils.retrieve_effect_dict()
    classifier = VariantClassifier()

    ref_is_blank = ref_allele == "" or ref_allele == "-"
    if ref_is_blank and (alt_allele == "" or alt_allele == "-"):
        return effect_ranks.get(VariantClassification.SILENT, 25)

    classification = classifier.variant_classify(
        tx, ref_allele, alt_allele, start, end, variant_type)
    return effect_ranks.get(classification.get_vc(), 25)