def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
    """Render the transcript change string for a variant on a transcript.

    :param tx: transcript; get_strand() is read here and the object is passed to the coordinate helpers
    :param variant_type: forwarded to TranscriptProviderUtils.render_transcript_change
    :param vc: classification result; get_vc(), get_secondary_vc() and get_cds_start_in_exon_space() are used
    :param start_genomic_space: start in genomic space (coerced with int())
    :param end_genomic_space: end in genomic space (coerced with int())
    :param ref_allele: reference allele, handed to self._get_stranded_alleles
    :param alt_allele: alternate allele, handed to self._get_stranded_alleles
    :return: the rendered transcript change, or "" for an intronic splice site or when the
        CDS start in exon space is "" or negative
    """
    is_intronic_splice_site = (
        vc.get_vc() == VariantClassification.SPLICE_SITE
        and vc.get_secondary_vc() == VariantClassification.INTRON
    )
    if is_intronic_splice_site:
        # Intronic splice sites intentionally get no transcript change.
        return ""

    if vc.get_cds_start_in_exon_space() == "" or vc.get_cds_start_in_exon_space() < 0:
        return ""

    exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        int(start_genomic_space), int(end_genomic_space), tx)

    # Negative-strand transcripts are shifted by one relative to positive-strand ones.
    strand_shift = 1 if tx.get_strand() == "-" else 0
    cds_start = exon_start - int(vc.get_cds_start_in_exon_space()) + strand_shift
    cds_end = exon_end - int(vc.get_cds_start_in_exon_space()) + strand_shift

    observed_stranded, reference_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    return TranscriptProviderUtils.render_transcript_change(
        variant_type, vc.get_vc(), cds_start, cds_end,
        reference_stranded, observed_stranded, vc.get_secondary_vc())
def annotate_mutation(self, mutation):
    """Annotate a mutation with transcript-derived annotations and return it.

    When no transcript overlaps the mutation it is classified as IGR and only
    the nearest-gene summary, placeholder gene fields and genome change are
    filled in.  Otherwise a transcript is chosen with self._choose_transcript
    and the full set of transcript annotations is produced.  In both cases the
    HGVS annotations from self._create_hgvs_annotation_dict are added last.

    :param mutation: mutation exposing chr, start, end, ref_allele, alt_allele and addAnnotations()
    :return: the same mutation object, after annotations were added
    """
    contig = mutation.chr
    start = int(mutation.start)
    end = int(mutation.end)
    txs = self.get_transcripts_by_pos(contig, start, end)

    final_annotation_dict = self._create_blank_set_of_annotations()
    final_annotation_dict['variant_type'] = Annotation(
        value=TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele),
        datasourceName=self.title)
    variant_type = final_annotation_dict['variant_type'].value
    basic = self._create_basic_annotation

    chosen_tx = None
    if len(txs) == 0:
        # No overlapping transcript means intergenic; the blank annotation set covers most fields.
        final_annotation_dict['variant_classification'] = basic(VariantClassification.IGR)
        nearest_genes = self._get_nearest_genes(contig, start, end)
        upstream, downstream = nearest_genes[0], nearest_genes[1]
        final_annotation_dict['other_transcripts'] = basic(
            value='%s (%s upstream) : %s (%s downstream)' % (upstream[0], upstream[1], downstream[0], downstream[1]))
        final_annotation_dict['gene'] = basic('Unknown')
        final_annotation_dict['gene_id'] = basic('0')
        final_annotation_dict['genome_change'] = basic(
            TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end,
                mutation.ref_allele, mutation.alt_allele, variant_type))
    else:
        # Pick the transcript according to the configured tx mode.
        chosen_tx = self._choose_transcript(
            txs, self.get_tx_mode(), variant_type,
            mutation.ref_allele, mutation.alt_allele, start, end)
        vcer = VariantClassifier()

        final_annotation_dict['annotation_transcript'] = basic(chosen_tx.get_transcript_id())
        final_annotation_dict['genome_change'] = basic(
            TranscriptProviderUtils.determine_genome_change(
                mutation.chr, mutation.start, mutation.end,
                mutation.ref_allele, mutation.alt_allele, variant_type))
        final_annotation_dict['strand'] = basic(chosen_tx.get_strand())
        final_annotation_dict['transcript_position'] = basic(
            TranscriptProviderUtils.render_transcript_position(start, end, chosen_tx))
        final_annotation_dict['transcript_id'] = basic(chosen_tx.get_transcript_id())

        classification = vcer.variant_classify(
            tx=chosen_tx, variant_type=variant_type,
            ref_allele=mutation.ref_allele, alt_allele=mutation.alt_allele,
            start=mutation.start, end=mutation.end)

        # Exon index is reported one-based.
        final_annotation_dict['transcript_exon'] = basic(str(classification.get_exon_i() + 1))
        final_annotation_dict['variant_classification'] = basic(classification.get_vc())
        final_annotation_dict['secondary_variant_classification'] = basic(classification.get_secondary_vc())
        final_annotation_dict['protein_change'] = basic(vcer.generate_protein_change_from_vc(classification))
        final_annotation_dict['codon_change'] = basic(
            vcer.generate_codon_change_from_vc(chosen_tx, start, end, classification))
        final_annotation_dict['transcript_change'] = basic(
            vcer.generate_transcript_change_from_tx(
                chosen_tx, variant_type, classification, start, end,
                mutation.ref_allele, mutation.alt_allele))
        final_annotation_dict['transcript_strand'] = basic(chosen_tx.get_strand())
        final_annotation_dict['gene'] = basic(chosen_tx.get_gene())
        final_annotation_dict['gene_type'] = basic(chosen_tx.get_gene_type())

        # (annotation name, gencode tag key) pairs, in the order they are looked up.
        gencode_fields = (
            ('gencode_transcript_tags', 'tag'),
            ('gencode_transcript_status', 'transcript_status'),
            ('havana_transcript', 'havana_transcript'),
            ('ccds_id', 'ccdsid'),
            ('gencode_transcript_type', 'transcript_type'),
            ('gencode_transcript_name', 'transcript_name'),
        )
        for annotation_name, tag_key in gencode_fields:
            final_annotation_dict[annotation_name] = basic(
                self._retrieve_gencode_tag_value(chosen_tx, tag_key))

        other_transcript_value = self._render_other_transcripts(
            txs, [txs.index(chosen_tx)], variant_type,
            mutation.ref_allele, mutation.alt_allele, mutation.start, mutation.end)
        final_annotation_dict['other_transcripts'] = basic(other_transcript_value)

    mutation.addAnnotations(final_annotation_dict)

    # HGVS annotations are added in every case; chosen_tx is None for IGR.
    hgvs_dict_annotations = self._create_hgvs_annotation_dict(mutation, chosen_tx)
    mutation.addAnnotations(hgvs_dict_annotations)
    return mutation
def _add(self, mutation):
    """Route a mutation to the queue matching its inferred variant type.

    xNP variants are appended to self.queue under the sample name returned by
    self.sns.getSampleName(mutation); every other variant type is appended to
    self.indel_queue, since only ONPs are combined.

    :param mutation: object exposing ref_allele and alt_allele
    """
    inferred_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
    if TranscriptProviderUtils.is_xnp(inferred_type):
        self.queue[self.sns.getSampleName(mutation)].append(mutation)
        return
    # Indels are never combined, so they are kept apart.
    self.indel_queue.append(mutation)
def generate_codon_change_from_vc(self, t, start, end, vc):
    """Render the codon change string for a classified variant.

    :param t: (Transcript) passed to self._get_splice_site_coordinates
    :param start: (int) variant start
    :param end: (int) variant end
    :param vc: (VariantClassification) classification result for the variant
    :return: the intronic splice-site rendering when vc is a SPLICE_SITE with secondary
        INTRON; "" when either reference codon boundary is ""; otherwise the output of
        TranscriptProviderUtils.render_codon_change
    """
    dist_from_exon = self._get_splice_site_coordinates(t, start, end, vc.get_exon_i())
    exon_i = vc.get_exon_i()

    is_intronic_splice_site = (
        vc.get_vc() == VariantClassification.SPLICE_SITE
        and vc.get_secondary_vc() == VariantClassification.INTRON
    )
    if is_intronic_splice_site:
        return TranscriptProviderUtils.render_intronic_splice_site_codon_change(dist_from_exon, exon_i)

    if vc.get_ref_codon_start_in_exon() == "" or vc.get_ref_codon_end_in_exon() == "":
        return ""

    def _exon_to_cds_space(exon_space_position):
        # Offset by the CDS start and add one.
        return int(exon_space_position) - int(vc.get_cds_start_in_exon_space()) + 1

    codon_start_cds = _exon_to_cds_space(vc.get_ref_codon_start_in_exon())
    codon_end_cds = _exon_to_cds_space(vc.get_ref_codon_end_in_exon())
    ref_codon_seq = vc.get_ref_codon()
    alt_codon_seq = vc.get_alt_codon()

    return TranscriptProviderUtils.render_codon_change(
        vc.get_vt(), vc.get_vc(), int(codon_start_cds), int(codon_end_cds),
        ref_codon_seq, alt_codon_seq, dist_from_exon, exon_i, vc.get_secondary_vc())
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build):
    """Create a MutationData from raw attributes and attach the preceding-bases annotation.

    chr, start, end and build are converted with str() before construction.
    For xNPs the preceding-bases annotation is set to "".  For deletions and
    insertions the alleles and coordinates are replaced by the values returned
    from MutUtils.retrievePrecedingBasesForDeletions / ...ForInsertions (the
    missing allele becomes "-"), on both the attribute and the item interface
    of the mutation, and the returned preceding bases are annotated.

    :return: the MutationData instance
    """
    mut = MutationData(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    varType = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

    if TranscriptProviderUtils.is_xnp(varType):
        # SNPs and other xNPs carry no preceding bases.
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue="")

    if varType == VariantClassification.VT_DEL:
        preceding_bases, new_ref, new_start, new_end = MutUtils.retrievePrecedingBasesForDeletions(mut)
        new_alt = "-"
    elif varType == VariantClassification.VT_INS:
        preceding_bases, new_alt, new_start, new_end = MutUtils.retrievePrecedingBasesForInsertions(mut)
        new_ref = "-"
    else:
        return mut

    # Keep the attribute view and the item view of the mutation in sync.
    for field_name, field_value in (("ref_allele", new_ref), ("alt_allele", new_alt),
                                    ("start", new_start), ("end", new_end)):
        setattr(mut, field_name, field_value)
        mut[field_name] = field_value
    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                         annotationValue=preceding_bases)
    return mut
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build, mutation_data_factory=None):
    """Create a mutation through a factory and attach the preceding-bases annotation.

    chr, start, end and build are converted with str() before being passed to
    the factory's create().  For xNPs the preceding-bases annotation is set to
    "".  For deletions and insertions the alleles and coordinates are replaced
    by the values returned from MutUtils.retrievePrecedingBasesForDeletions /
    ...ForInsertions (the missing allele becomes "-"), on both the attribute
    and the item interface of the mutation.

    :param mutation_data_factory: factory providing create(); a new MutationDataFactory()
        is used when None
    :return: the created mutation
    """
    if mutation_data_factory is None:
        mutation_data_factory = MutationDataFactory()
    mut = mutation_data_factory.create(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    varType = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

    if TranscriptProviderUtils.is_xnp(varType):
        # SNPs and other xNPs carry no preceding bases.
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME, annotationValue="")

    if varType == VariantClassification.VT_DEL:
        preceding_bases, new_ref, new_start, new_end = MutUtils.retrievePrecedingBasesForDeletions(mut)
        new_alt = "-"
    elif varType == VariantClassification.VT_INS:
        preceding_bases, new_alt, new_start, new_end = MutUtils.retrievePrecedingBasesForInsertions(mut)
        new_ref = "-"
    else:
        return mut

    # Keep the attribute view and the item view of the mutation in sync.
    for field_name, field_value in (("ref_allele", new_ref), ("alt_allele", new_alt),
                                    ("start", new_start), ("end", new_end)):
        setattr(mut, field_name, field_value)
        mut[field_name] = field_value
    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                         annotationValue=preceding_bases)
    return mut
def _determine_codon_overlap(self, s, e, codon_tuple, variant_type):
    """Tell whether the variant interval overlaps the given codon interval.

    :param s: variant start
    :param e: variant end (ignored for insertions, which are tested at s only)
    :param codon_tuple: two-item sequence describing the codon, or None
    :param variant_type: compared against VariantClassification.VT_INS
    :return: False when codon_tuple is None, otherwise the result of
        TranscriptProviderUtils.test_overlap against (codon_tuple[0] + 1, codon_tuple[1])
    """
    if codon_tuple is None:
        return False
    # Insertions collapse to their start coordinate.
    query_end = s if variant_type == VariantClassification.VT_INS else e
    return TranscriptProviderUtils.test_overlap(s, query_end, codon_tuple[0] + 1, codon_tuple[1])
def _add(self, mutation):
    """Route a mutation to the queue matching its inferred variant type.

    xNP variants are appended to self.queue under the sample name returned by
    self.sns.getSampleName(mutation); every other variant type is appended to
    self.indel_queue, since only ONPs are combined.

    :param mutation: object exposing ref_allele and alt_allele
    """
    inferred_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
    if TranscriptProviderUtils.is_xnp(inferred_type):
        self.queue[self.sns.getSampleName(mutation)].append(mutation)
        return
    # Indels are never combined, so they are kept apart.
    self.indel_queue.append(mutation)
def _determine_if_cds_overlap(self, s, e, tx, variant_type):
    """Tell whether the variant interval overlaps the transcript's CDS features.

    :param s: variant start
    :param e: variant end (ignored for insertions, which are tested at s only)
    :param tx: transcript; tx.get_cds() supplies the features to test
    :param variant_type: compared against VariantClassification.VT_INS
    :return: True when TranscriptProviderUtils.test_feature_overlap returns anything but -1
    """
    # Insertions collapse to their start coordinate.
    query_end = s if variant_type == VariantClassification.VT_INS else e
    return TranscriptProviderUtils.test_feature_overlap(s, query_end, tx.get_cds()) != -1
def _determine_codon_overlap(self, s, e, codon_tuple, variant_type):
    """Tell whether the variant interval overlaps the given codon interval.

    :param s: variant start
    :param e: variant end (ignored for insertions, which are tested at s only)
    :param codon_tuple: two-item sequence describing the codon, or None
    :param variant_type: compared against VariantClassification.VT_INS
    :return: False when codon_tuple is None, otherwise the result of
        TranscriptProviderUtils.test_overlap against (codon_tuple[0] + 1, codon_tuple[1])
    """
    if codon_tuple is None:
        return False
    # Insertions collapse to their start coordinate.
    query_end = s if variant_type == VariantClassification.VT_INS else e
    return TranscriptProviderUtils.test_overlap(s, query_end, codon_tuple[0] + 1, codon_tuple[1])
def test_codon_single_base(self, start, end, ref_base_stranded, gt_codon):
    """Check that the three bases of the codon covering a single base are retrieved.

    The position is walked from genomic space to exon space, then to protein
    positions and back to the codon boundaries, and the sliced transcript
    sequence is compared with gt_codon.  ref_base_stranded is not used here.
    """
    tx = self.retrieve_test_transcript_MAPK1()
    exon_space_start, exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
    cds_start, cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    protein_start, protein_end = TranscriptProviderUtils.get_protein_positions(
        exon_space_start, exon_space_end, cds_start)
    codon_first, codon_last = TranscriptProviderUtils.get_cds_codon_positions(
        protein_start, protein_end, cds_start)

    # The codon end is inclusive, hence the +1 on the slice.
    codon_seq = tx.get_seq()[codon_first:codon_last + 1]
    failure_message = "Did not get correct codon (%s): %s   loc: %s-%s" % (gt_codon, codon_seq, start, end)
    self.assertTrue(codon_seq == gt_codon, failure_message)
def _determine_if_splice_site_overlap(self, start_genomic_space, end_genomic_space, tx, variant_type, dist=2):
    """Detect whether the variant overlaps a splice-site window around an exon boundary.

    The transcript's outer boundaries (the stranded start of the first exon and
    end of the last exon) are not treated as splice sites, so only the inner
    side of the first and last exon is checked.  Insertions are tested at their
    start position only.  Assumes start <= end.

    :param start_genomic_space: int in genomic space
    :param end_genomic_space: int in genomic space
    :param tx: Transcript (get_exons() and get_strand() are used)
    :param variant_type: compared against VariantClassification.VT_INS
    :param dist: half-width used to build the window around each exon boundary
    :return: (True, exon_i, is_right_overlap) on a hit, where is_right_overlap is True
        for the higher-genomic-position side of the exon.
        NOTE(review): on a miss a 4-tuple (False, -1, None, False) is returned, which
        differs in length from the hit case -- kept as-is, confirm against callers.
    """
    exons = tx.get_exons()
    strand = tx.get_strand()

    if variant_type == VariantClassification.VT_INS:
        # An insertion only counts when it starts inside the window.
        end_genomic_space = start_genomic_space

    last_exon_i = len(exons) - 1
    for i, exon in enumerate(exons):
        is_first = (i == 0)
        is_last = (i == last_exon_i)
        is_internal_exon = (i > 0) and (i < last_exon_i)
        check_left = is_internal_exon or (strand == "-" and is_first) or (strand == "+" and is_last)
        check_right = is_internal_exon or (strand == "+" and is_first) or (strand == "-" and is_last)

        if check_left:
            window_lo = exon[0] - dist + 1
            window_hi = exon[0] + (dist - 1) + 1
            if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_lo, window_hi):
                return True, i, False

        if check_right:
            window_lo = exon[1] - (dist - 1)
            window_hi = exon[1] + dist
            if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_lo, window_hi):
                return True, i, True

    return False, -1, None, False
def _determine_de_novo_old(self, vc, transcript_position_start, transcript_position_end, ref, alt, tx, variant_type):
    """Return a de novo start classification when a 5'UTR variant creates an ATG.

    The input vc is returned unchanged unless it equals
    VariantClassification.FIVE_PRIME_UTR, ref differs from alt, and the mutated
    sequence around the variant (two bases of padding on each side) contains 'ATG'.

    :param vc: current variant classification
    :param transcript_position_start: variant start in transcript coordinates
    :param transcript_position_end: variant end in transcript coordinates
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: compared against VariantClassification.VT_INS
    :return: vc, or 'De_novo_Start_InFrame' / 'De_novo_Start_OutOfFrame'
    """
    if not (vc == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc

    observed_allele_stranded = self._determine_stranded_allele(alt, tx.get_strand())
    # The stranded reference allele is computed but not used below.
    reference_allele_stranded = self._determine_stranded_allele(ref, tx.get_strand())
    tx_seq = tx.get_seq()

    if variant_type == VariantClassification.VT_INS:
        # Insertions are collapsed to a single position; which one depends on the strand.
        if tx.get_strand() == "-":
            transcript_position_start = transcript_position_end
        else:
            transcript_position_end = transcript_position_start

    utr_region_start = transcript_position_start - 2
    utr_region_end = transcript_position_end + 2

    # TODO: This may not work for "+" strand.  Need unit test.
    utr_region_seq = tx_seq[utr_region_start:utr_region_end + 1]
    mutated_utr_region_seq = TranscriptProviderUtils.mutate_reference_sequence(
        utr_region_seq, utr_region_start, transcript_position_start,
        transcript_position_end, observed_allele_stranded, variant_type)

    atg_offset = mutated_utr_region_seq.find('ATG')
    if atg_offset < 0:
        return vc

    cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    atg_position = utr_region_start + atg_offset + 1
    # In frame when the distance to the CDS start is a multiple of three.
    is_in_frame = (cds_start_in_exon_space - atg_position) % 3 == 0
    return 'De_novo_Start_' + ('InFrame' if is_in_frame else 'OutOfFrame')
def _determine_de_novo(self, vc_str, exon_start, ref, alt, tx, variant_type, buffer=2):
    """Return a de novo start classification when a 5'UTR variant creates an ATG.

    The input vc_str is returned unchanged unless it equals
    VariantClassification.FIVE_PRIME_UTR, ref differs from alt, and the mutated
    region produced by self._mutate_exon contains 'ATG'.

    :param vc_str: current variant classification
    :param exon_start: position handed to self._mutate_exon and used to locate the ATG
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: forwarded to self._mutate_exon
    :param buffer: padding handed to self._mutate_exon and subtracted when locating the ATG
    :return: vc_str, or 'De_novo_Start_InFrame' / 'De_novo_Start_OutOfFrame'
    """
    if not (vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc_str

    mutated_utr_region = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
    atg_offset = mutated_utr_region.find('ATG')
    if atg_offset < 0:
        return vc_str

    atg_exon_position = exon_start + atg_offset - buffer
    cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    # In frame when the distance to the CDS start is a multiple of three.
    is_in_frame = (cds_start_in_exon_space - atg_exon_position) % 3 == 0
    return 'De_novo_Start_' + ('InFrame' if is_in_frame else 'OutOfFrame')
def _determine_de_novo(self, vc_str, exon_start, ref, alt, tx, variant_type, buffer=2):
    """Return a de novo start classification when a 5'UTR variant creates an ATG.

    The input vc_str is returned unchanged unless it equals
    VariantClassification.FIVE_PRIME_UTR, ref differs from alt, and the mutated
    region produced by self._mutate_exon contains 'ATG'.

    :param vc_str: current variant classification
    :param exon_start: position handed to self._mutate_exon and used to locate the ATG
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: forwarded to self._mutate_exon
    :param buffer: padding handed to self._mutate_exon and subtracted when locating the ATG
    :return: vc_str, or 'De_novo_Start_InFrame' / 'De_novo_Start_OutOfFrame'
    """
    if not (vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc_str

    mutated_utr_region = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
    atg_offset = mutated_utr_region.find('ATG')
    if atg_offset < 0:
        return vc_str

    atg_exon_position = exon_start + atg_offset - buffer
    cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    # In frame when the distance to the CDS start is a multiple of three.
    is_in_frame = (cds_start_in_exon_space - atg_exon_position) % 3 == 0
    return 'De_novo_Start_' + ('InFrame' if is_in_frame else 'OutOfFrame')
def _is_matching(self, mut, tsv_record):
    """Decide whether a tsv record matches the given mutation.

    In "exact" match mode the chromosome, start and end must be equal; when the
    index also has "ref" and "alt" columns, the alleles must be equal too (the
    record is first turned into a mutation so that indel representations are
    comparable).  In any other match mode the record matches when its interval
    overlaps the mutation's interval.

    :param mut: mutation exposing chr, start, end, ref_allele and alt_allele
    :param tsv_record: sequence indexed through self.tsv_index
    :return: True or False in exact mode; the result of
        TranscriptProviderUtils.test_overlap otherwise
    """
    chrom = tsv_record[self.tsv_index["chrom"]]
    start_pos = tsv_record[self.tsv_index["start"]]
    end_pos = tsv_record[self.tsv_index["end"]]

    if self.match_mode != "exact":
        return TranscriptProviderUtils.test_overlap(
            int(mut.start), int(mut.end), int(start_pos), int(end_pos))

    has_allele_columns = "ref" in self.tsv_index and "alt" in self.tsv_index
    if not has_allele_columns:
        # Positional comparison only.
        if mut.chr == chrom and int(mut.start) == int(start_pos) and int(mut.end) == int(end_pos):
            return True
        return False

    build = "hg19"
    ref = tsv_record[self.tsv_index["ref"]]
    alt = tsv_record[self.tsv_index["alt"]]
    if ref == "-" or alt == "-":
        # Record is already in Mutation Annotation Format style.
        # TODO: This looks risky to be calling the MutationData constructor directly
        ds_mut = MutationData(chrom, start_pos, end_pos, ref, alt, build)
    else:
        # Record is not in Mutation Annotation Format style.
        ds_mut = MutUtils.initializeMutFromAttributes(chrom, start_pos, end_pos, ref, alt, build)

    if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \
            and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \
            and int(mut.end) == int(ds_mut.end):
        return True
    return False
def _is_matching(self, mut, tsv_record):
    """Decide whether a tsv record matches the given mutation.

    In "exact" match mode the chromosome, start and end must be equal; when the
    index also has "ref" and "alt" columns, the alleles must be equal too (the
    record is first turned into a mutation so that indel representations are
    comparable).  In any other match mode the record matches when its interval
    overlaps the mutation's interval.

    :param mut: mutation exposing chr, start, end, ref_allele and alt_allele
    :param tsv_record: sequence indexed through self.tsv_index
    :return: True or False in exact mode; the result of
        TranscriptProviderUtils.test_overlap otherwise
    """
    chrom = tsv_record[self.tsv_index["chrom"]]
    start_pos = tsv_record[self.tsv_index["start"]]
    end_pos = tsv_record[self.tsv_index["end"]]

    if self.match_mode != "exact":
        return TranscriptProviderUtils.test_overlap(
            int(mut.start), int(mut.end), int(start_pos), int(end_pos))

    has_allele_columns = "ref" in self.tsv_index and "alt" in self.tsv_index
    if not has_allele_columns:
        # Positional comparison only.
        if mut.chr == chrom and int(mut.start) == int(start_pos) and int(mut.end) == int(end_pos):
            return True
        return False

    build = "hg19"
    ref = tsv_record[self.tsv_index["ref"]]
    alt = tsv_record[self.tsv_index["alt"]]
    if ref == "-" or alt == "-":
        # Record is already in Mutation Annotation Format style.
        # TODO: This looks risky to be calling the MutationData constructor directly
        ds_mut = MutationData(chrom, start_pos, end_pos, ref, alt, build)
    else:
        # Record is not in Mutation Annotation Format style.
        ds_mut = MutUtils.initializeMutFromAttributes(chrom, start_pos, end_pos, ref, alt, build)

    if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \
            and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \
            and int(mut.end) == int(ds_mut.end):
        return True
    return False
def _choose_best_effect_transcript(self, txs, variant_type, ref_allele, alt_allele, start, end):
    """Choose the transcript with the most detrimental effect.

    Each transcript's variant classification is looked up in the effect
    dictionary from TranscriptProviderUtils (lower score wins; classifications
    missing from the dictionary score 25).  A variant whose ref and alt alleles
    are both empty or "-" is scored as SILENT without classifying.  Ties go to
    the transcript with the longer get_seq() sequence.

    :param list txs: list of Transcript
    :param str variant_type:
    :param str ref_allele:
    :param str alt_allele:
    :param str start:
    :param str end:
    :return Transcript: the selected transcript, or None when txs is empty
    """
    vcer = VariantClassifier()
    effect_dict = TranscriptProviderUtils.retrieve_effect_dict()
    blank_alleles = ("", "-")

    best_effect_score = 100000000  # sentinel; any real score is lower
    best_effect_tx = None
    for tx in txs:
        if ref_allele in blank_alleles and alt_allele in blank_alleles:
            vc = VariantClassification.SILENT
        else:
            vc = vcer.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type).get_vc()
        effect_score = effect_dict.get(vc, 25)

        is_strictly_better = effect_score < best_effect_score
        if is_strictly_better or (
                effect_score == best_effect_score
                and len(best_effect_tx.get_seq()) < len(tx.get_seq())):
            best_effect_score = effect_score
            best_effect_tx = tx
    return best_effect_tx
def test_convert_genomic_space_to_transcript_space(self):
    """Check genomic-to-transcript conversion on the saccer ensembl test datasource."""
    config_dir = "testdata/ensembl/saccer/"
    ensembl_ds = DatasourceFactory.createDatasource(config_dir + "ensembl.config", config_dir)
    convert = TranscriptProviderUtils.convert_genomic_space_to_transcript_space

    # Transcript starts at 335.
    tx = ensembl_ds.get_overlapping_transcripts("I", "350", "350")
    start, end = convert("350", "350", tx[0])
    self.assertTrue(start == end)
    self.assertTrue(start == 16)

    # Transcript starts at 764697 (strand is '-').
    tx = ensembl_ds.get_overlapping_transcripts("II", "764690", "764690")
    start, end = convert("764690", "764690", tx[0])
    self.assertTrue(start == end)
    self.assertTrue(start == 7)

    # Same transcript, ten-base-wider genomic interval.
    start, end = convert("764680", "764690", tx[0])
    self.assertTrue(start == (end - 10))
    self.assertTrue(start == 7)
def test_seq(self, start, end, gt):
    """Check that the transcript sequence sliced at a genomic interval equals gt."""
    tx = self.retrieve_test_transcript_MAPK1()
    exon_space_start, exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)

    # The end coordinate is inclusive, hence the +1 on the slice.
    seq = tx.get_seq()[exon_space_start:exon_space_end + 1]
    failure_message = "Incorrect seq found guess,gt (%s, %s)" % (seq, gt)
    self.assertTrue(seq == gt, failure_message)
def test_mutate_reference_seqeunce(self, vt, start, end, ref, alt, start_exon_space, end_exon_space, mutated_seq_gt):
    """Test that we can render a mutated sequence with SNP, INS, and DEL.

    mutated_seq_gt is stranded and the test transcript is on the "-" strand,
    so the alt allele is reverse-complemented before mutating.  start, end and
    ref only appear in the failure message.
    """
    tx = self.retrieve_test_transcript_MAPK1()
    observed_allele = Bio.Seq.reverse_complement(alt)

    # The end coordinate is inclusive, hence the +1 on the slice.
    reference_region = tx.get_seq()[start_exon_space:end_exon_space + 1]
    mutated_allele = TranscriptProviderUtils.mutate_reference_sequence(
        reference_region, start_exon_space, start_exon_space, end_exon_space, observed_allele, vt)

    case_description = str([vt, start, end, ref, alt, start_exon_space, end_exon_space, mutated_seq_gt])
    failure_message = "No match (gt/guess)  %s/%s for %s." % (mutated_seq_gt, mutated_allele, case_description)
    self.assertTrue(mutated_seq_gt == mutated_allele, failure_message)
def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
    """Translate the coding portion of a transcript sequence.

    The CDS bounds are converted from genomic space to positions within the
    exons, the matching slice of seq is translated with
    MutUtils.translate_sequence, and a single trailing '*' is removed.

    :param exons: exon features handed to the coordinate conversion
    :param seq: transcript sequence to slice
    :param cds_start_genomic_space: CDS start in genomic space (coerced with int())
    :param cds_stop_genomic_space: CDS stop in genomic space (coerced with int())
    :param strand: transcript strand handed to the coordinate conversion
    :return: the translated sequence without a trailing '*'
    """
    cds_bounds = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
        int(cds_start_genomic_space), int(cds_stop_genomic_space), exons, strand)
    cds_start_exon_space, cds_stop_exon_space = cds_bounds

    coding_seq = seq[int(cds_start_exon_space):int(cds_stop_exon_space)]
    prot_seq = MutUtils.translate_sequence(coding_seq)

    ends_with_stop = len(prot_seq) > 0 and prot_seq[-1] == '*'
    if ends_with_stop:
        prot_seq = prot_seq[:-1]
    return prot_seq
def _calculate_effect_score(tx, start, end, alt_allele, ref_allele, variant_type):
    """Compute the effect score of a variant on a transcript.

    The variant classification is looked up in the effect dictionary from
    TranscriptProviderUtils; classifications missing from it score 25.  A
    variant whose ref and alt alleles are both empty or "-" is scored as SILENT
    without running the classifier.

    :return: the effect score for the variant on tx
    """
    effect_dict = TranscriptProviderUtils.retrieve_effect_dict()
    vcer = VariantClassifier()
    blank_alleles = ("", "-")

    if ref_allele in blank_alleles and alt_allele in blank_alleles:
        vc = VariantClassification.SILENT
    else:
        classified = vcer.variant_classify(tx, ref_allele, alt_allele, start, end, variant_type)
        vc = classified.get_vc()
    return effect_dict.get(vc, 25)
def _determine_if_splice_site_overlap(self, start_genomic_space, end_genomic_space, tx, variant_type, dist=2):
    """Detect whether the variant overlaps a splice-site window around an exon boundary.

    The transcript's outer boundaries (the stranded start of the first exon and
    end of the last exon) are not treated as splice sites, so only the inner
    side of the first and last exon is checked.  Insertions are tested at their
    start position only.  Assumes start <= end.

    :param start_genomic_space: int in genomic space
    :param end_genomic_space: int in genomic space
    :param tx: Transcript (get_exons() and get_strand() are used)
    :param variant_type: compared against VariantClassification.VT_INS
    :param dist: half-width used to build the window around each exon boundary
    :return: (True, exon_i, is_right_overlap) on a hit, where is_right_overlap is True
        for the higher-genomic-position side of the exon.
        NOTE(review): on a miss a 4-tuple (False, -1, None, False) is returned, which
        differs in length from the hit case -- kept as-is, confirm against callers.
    """
    exons = tx.get_exons()
    strand = tx.get_strand()

    if variant_type == VariantClassification.VT_INS:
        # An insertion only counts when it starts inside the window.
        end_genomic_space = start_genomic_space

    last_exon_i = len(exons) - 1
    for i, exon in enumerate(exons):
        is_first = (i == 0)
        is_last = (i == last_exon_i)
        is_internal_exon = (i > 0) and (i < last_exon_i)
        check_left = is_internal_exon or (strand == "-" and is_first) or (strand == "+" and is_last)
        check_right = is_internal_exon or (strand == "+" and is_first) or (strand == "-" and is_last)

        if check_left:
            window_lo = exon[0] - dist + 1
            window_hi = exon[0] + (dist - 1) + 1
            if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_lo, window_hi):
                return True, i, False

        if check_right:
            window_lo = exon[1] - (dist - 1)
            window_hi = exon[1] + dist
            if TranscriptProviderUtils.test_overlap(start_genomic_space, end_genomic_space, window_lo, window_hi):
                return True, i, True

    return False, -1, None, False
def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
    """Render the transcript change string for a variant on a transcript.

    :param tx: transcript; get_strand() is read here and the object is passed to the coordinate helpers
    :param variant_type: forwarded to TranscriptProviderUtils.render_transcript_change
    :param vc: classification result; get_vc(), get_secondary_vc() and get_cds_start_in_exon_space() are used
    :param start_genomic_space: start in genomic space (coerced with int())
    :param end_genomic_space: end in genomic space (coerced with int())
    :param ref_allele: reference allele, handed to self._get_stranded_alleles
    :param alt_allele: alternate allele, handed to self._get_stranded_alleles
    :return: the rendered transcript change, or "" for an intronic splice site or when the
        CDS start in exon space is "" or negative
    """
    is_intronic_splice_site = (
        vc.get_vc() == VariantClassification.SPLICE_SITE
        and vc.get_secondary_vc() == VariantClassification.INTRON
    )
    if is_intronic_splice_site:
        # Intronic splice sites intentionally get no transcript change.
        return ""

    if vc.get_cds_start_in_exon_space() == "" or vc.get_cds_start_in_exon_space() < 0:
        return ""

    exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        int(start_genomic_space), int(end_genomic_space), tx)

    # Negative-strand transcripts are shifted by one relative to positive-strand ones.
    strand_shift = 1 if tx.get_strand() == "-" else 0
    cds_start = exon_start - int(vc.get_cds_start_in_exon_space()) + strand_shift
    cds_end = exon_end - int(vc.get_cds_start_in_exon_space()) + strand_shift

    observed_stranded, reference_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    return TranscriptProviderUtils.render_transcript_change(
        variant_type, vc.get_vc(), cds_start, cds_end,
        reference_stranded, observed_stranded, vc.get_secondary_vc())
def _determine_de_novo_old(self, vc, transcript_position_start, transcript_position_end, ref, alt, tx, variant_type):
    """Return a de novo start classification when a 5'UTR variant creates an ATG.

    The input vc is returned unchanged unless it equals
    VariantClassification.FIVE_PRIME_UTR, ref differs from alt, and the mutated
    sequence around the variant (two bases of padding on each side) contains 'ATG'.

    :param vc: current variant classification
    :param transcript_position_start: variant start in transcript coordinates
    :param transcript_position_end: variant end in transcript coordinates
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: compared against VariantClassification.VT_INS
    :return: vc, or 'De_novo_Start_InFrame' / 'De_novo_Start_OutOfFrame'
    """
    if not (vc == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc

    observed_allele_stranded = self._determine_stranded_allele(alt, tx.get_strand())
    # The stranded reference allele is computed but not used below.
    reference_allele_stranded = self._determine_stranded_allele(ref, tx.get_strand())
    tx_seq = tx.get_seq()

    if variant_type == VariantClassification.VT_INS:
        # Insertions are collapsed to a single position; which one depends on the strand.
        if tx.get_strand() == "-":
            transcript_position_start = transcript_position_end
        else:
            transcript_position_end = transcript_position_start

    utr_region_start = transcript_position_start - 2
    utr_region_end = transcript_position_end + 2

    # TODO: This may not work for "+" strand.  Need unit test.
    utr_region_seq = tx_seq[utr_region_start:utr_region_end + 1]
    mutated_utr_region_seq = TranscriptProviderUtils.mutate_reference_sequence(
        utr_region_seq, utr_region_start, transcript_position_start,
        transcript_position_end, observed_allele_stranded, variant_type)

    atg_offset = mutated_utr_region_seq.find('ATG')
    if atg_offset < 0:
        return vc

    cds_start_in_exon_space, cds_end_in_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    atg_position = utr_region_start + atg_offset + 1
    # In frame when the distance to the CDS start is a multiple of three.
    is_in_frame = (cds_start_in_exon_space - atg_position) % 3 == 0
    return 'De_novo_Start_' + ('InFrame' if is_in_frame else 'OutOfFrame')
def _calculate_protein_sequence(self, exons, seq, cds_start_genomic_space, cds_stop_genomic_space, strand):
    """Translate the coding portion of a transcript sequence.

    The CDS bounds are converted from genomic space to positions within the
    exons, the matching slice of seq is translated with
    MutUtils.translate_sequence, and a single trailing '*' is removed.

    :param exons: exon features handed to the coordinate conversion
    :param seq: transcript sequence to slice
    :param cds_start_genomic_space: CDS start in genomic space (coerced with int())
    :param cds_stop_genomic_space: CDS stop in genomic space (coerced with int())
    :param strand: transcript strand handed to the coordinate conversion
    :return: the translated sequence without a trailing '*'
    """
    cds_bounds = TranscriptProviderUtils._convert_genomic_space_to_feature_space(
        int(cds_start_genomic_space), int(cds_stop_genomic_space), exons, strand)
    cds_start_exon_space, cds_stop_exon_space = cds_bounds

    coding_seq = seq[int(cds_start_exon_space):int(cds_stop_exon_space)]
    prot_seq = MutUtils.translate_sequence(coding_seq)

    ends_with_stop = len(prot_seq) > 0 and prot_seq[-1] == '*'
    if ends_with_stop:
        prot_seq = prot_seq[:-1]
    return prot_seq
def _extract_exon_info(self, position, tx):
    """Create basic information about the given position relative to the transcript.

    :param int position: in genomic space
    :param Transcript tx:
    :return tuple: four items; all None when no closest exon could be determined:
        [0]: closest exon index of the position as returned by
             TranscriptProviderUtils.determine_closest_exon
        [1]: True when both distances returned by determine_closest_distance_from_exon
             are positive
        [2]: True when the position overlaps the exon (left distance <= 0 and right
             distance >= 0)
        [3]: True when the transcript is on the "-" strand
    """
    closest_exon_i = TranscriptProviderUtils.determine_closest_exon(tx, position, position)
    if closest_exon_i is None:
        return None, None, None, None

    distance_left, distance_right = TranscriptProviderUtils.determine_closest_distance_from_exon(
        position, position, closest_exon_i, tx)
    overlaps_exon = (distance_left <= 0) and (distance_right >= 0)
    both_distances_positive = (distance_left > 0) and (distance_right > 0)
    on_negative_strand = (tx.get_strand() == "-")
    return closest_exon_i, both_distances_positive, overlaps_exon, on_negative_strand
def __get_overlapping_records(self, records, start, end, type):
    """Return the records whose interval overlaps ``[start, end]``.

    :param records: iterable of mappings; each must provide the start/end keys
        that belong to the requested ``type``
    :param start: query start, passed to ``TranscriptProviderUtils.test_overlap``
    :param end: query end, passed to ``TranscriptProviderUtils.test_overlap``
    :param type: ``'gene'`` (keys ``start``/``end``) or ``'transcript'``
        (keys ``footprint_start``/``footprint_end``)
    :return: list of the overlapping records, in input order
    :raises ValueError: if ``type`` is neither ``'gene'`` nor ``'transcript'``
    """
    if type == 'gene':
        st_key, en_key = 'start', 'end'
    elif type == 'transcript':
        st_key, en_key = 'footprint_start', 'footprint_end'
    else:
        # Previously an unknown type left the keys unbound, which surfaced as an
        # UnboundLocalError on the first record (or silently as [] for no records).
        raise ValueError("Unsupported record type: %r (expected 'gene' or 'transcript')" % (type,))

    return [r for r in records
            if TranscriptProviderUtils.test_overlap(start, end, r[st_key], r[en_key])]
def __get_overlapping_records(self, records, start, end, type):
    """Return the records whose interval overlaps ``[start, end]``.

    :param records: iterable of mappings; each must provide the start/end keys
        that belong to the requested ``type``
    :param start: query start, passed to ``TranscriptProviderUtils.test_overlap``
    :param end: query end, passed to ``TranscriptProviderUtils.test_overlap``
    :param type: ``"gene"`` (keys ``start``/``end``) or ``"transcript"``
        (keys ``footprint_start``/``footprint_end``)
    :return: list of the overlapping records, in input order
    :raises ValueError: if ``type`` is neither ``"gene"`` nor ``"transcript"``
    """
    if type == "gene":
        st_key, en_key = "start", "end"
    elif type == "transcript":
        st_key, en_key = "footprint_start", "footprint_end"
    else:
        # Previously an unknown type left the keys unbound, which surfaced as an
        # UnboundLocalError on the first record (or silently as [] for no records).
        raise ValueError("Unsupported record type: %r (expected 'gene' or 'transcript')" % (type,))

    return [r for r in records
            if TranscriptProviderUtils.test_overlap(start, end, r[st_key], r[en_key])]
def annotate_mutation(self, mutation, upstream_padding=3000, downstream_padding=0):
    """Annotate one mutation by running it through the GAF annotation steps.

    The mutation first receives a ``variant_type`` annotation inferred from its
    ref/alt alleles, then it is passed (wrapped in a one-element list) through
    the ``gaf_annotation`` stages and ``self._annotateMutationFromTranscripts``.

    :param mutation: mutation to annotate; modified via ``createAnnotation``
    :param upstream_padding: accepted for interface compatibility; not read in this method
    :param downstream_padding: accepted for interface compatibility; not read in this method
    :return: the first item produced by the annotation pipeline
    """
    variant_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
    mutation.createAnnotation('variant_type', variant_type, self.title)

    data = [mutation]
    data = gaf_annotation.find_mut_in_gaf(data, self)
    data = gaf_annotation.identify_best_effect_transcript(data, self)
    data = gaf_annotation.identify_best_canonical_transcript(data, self)
    data = gaf_annotation.correct_transcript_coordinates(data, self)
    data = gaf_annotation.infer_output_fields(data, self)
    data = self._annotateMutationFromTranscripts(data)

    # Use the builtin next() rather than the Python 2-only ``.next()`` method so the
    # call works on both Python 2.6+ and Python 3 iterators.
    annotated_mutation = next(data)
    return annotated_mutation
def generate_codon_change_from_vc(self, t, start, end, vc):
    """Render the codon change for a classified variant.

    :param t: (Transcript)
    :param start: (int)
    :param end: (int)
    :param vc: (VariantClassification)
    :return: the intronic splice-site rendering when ``vc`` is a splice site whose
        secondary classification is intron; ``""`` when the reference codon start
        or end is the empty string; otherwise the result of
        ``TranscriptProviderUtils.render_codon_change``
    """
    dist_from_exon = self._get_splice_site_coordinates(t, start, end, vc.get_exon_i())
    exon_i = vc.get_exon_i()

    is_intronic_splice_site = (vc.get_vc() == VariantClassification.SPLICE_SITE
                               and vc.get_secondary_vc() == VariantClassification.INTRON)
    if is_intronic_splice_site:
        return TranscriptProviderUtils.render_intronic_splice_site_codon_change(dist_from_exon, exon_i)

    # Without both codon coordinates there is nothing to render.
    if vc.get_ref_codon_start_in_exon() == "" or vc.get_ref_codon_end_in_exon() == "":
        return ""

    # Shift the codon coordinates so they are relative to the CDS start (plus one).
    codon_start_cds = int(vc.get_ref_codon_start_in_exon()) - int(vc.get_cds_start_in_exon_space()) + 1
    codon_end_cds = int(vc.get_ref_codon_end_in_exon()) - int(vc.get_cds_start_in_exon_space()) + 1

    ref_codon = vc.get_ref_codon()
    alt_codon = vc.get_alt_codon()
    return TranscriptProviderUtils.render_codon_change(
        vc.get_vt(), vc.get_vc(), int(codon_start_cds), int(codon_end_cds),
        ref_codon, alt_codon, dist_from_exon, exon_i, vc.get_secondary_vc())
def generate_protein_change_from_vc(self, vc):
    """Render the protein change described by a variant classification.

    :param vc: VariantClassification
    :return: ``""`` when the reference protein start or end is the empty string,
        otherwise the result of ``TranscriptProviderUtils.render_protein_change``
    """
    first_aa_position = vc.get_ref_protein_start()
    last_aa_position = vc.get_ref_protein_end()

    # No protein coordinates means there is no protein change to report.
    if first_aa_position == "" or last_aa_position == "":
        return ""

    ref_aa = vc.get_ref_aa()
    alt_aa = vc.get_alt_aa()
    return TranscriptProviderUtils.render_protein_change(
        vc.get_vt(), vc.get_vc(), int(first_aa_position), int(last_aa_position),
        ref_aa, alt_aa, vc.get_secondary_vc())
def _get_splice_site_coordinates(self, t, start, end, exon_i):
    """Returns distance from exon.

    The two distances from ``TranscriptProviderUtils.determine_closest_distance_from_exon``
    are compared by absolute value. The smaller one is negated and then clamped so the
    left side never yields more than -1 and the right side never less than 1; a tie
    yields 0. The sign is flipped for transcripts on the "-" strand.
    """
    left_diff, right_diff = TranscriptProviderUtils.determine_closest_distance_from_exon(start, end, exon_i, t)
    left_size, right_size = abs(left_diff), abs(right_diff)

    if left_size < right_size:
        distance = left_diff * -1
        distance = -1 if distance > -1 else distance
    elif right_size < left_size:
        distance = right_diff * -1
        distance = 1 if distance < 1 else distance
    else:
        distance = 0

    if t.get_strand() == "-":
        return distance * -1
    return distance
def generate_protein_change_from_vc(self, vc):
    """Produce the rendered protein change for ``vc``.

    :param vc: VariantClassification
    :return: ``""`` if either reference protein coordinate is the empty string;
        otherwise whatever ``TranscriptProviderUtils.render_protein_change`` returns
    """
    prot_start = vc.get_ref_protein_start()
    prot_end = vc.get_ref_protein_end()

    coordinates_available = not (prot_start == "" or prot_end == "")
    if coordinates_available:
        reference_allele = vc.get_ref_aa()
        alternate_allele = vc.get_alt_aa()
        rendered = TranscriptProviderUtils.render_protein_change(
            vc.get_vt(),
            vc.get_vc(),
            int(prot_start),
            int(prot_end),
            reference_allele,
            alternate_allele,
            vc.get_secondary_vc())
    else:
        # Missing coordinates: nothing can be rendered.
        rendered = ""
    return rendered
def _get_splice_site_coordinates(self, t, start, end, exon_i):
    """Returns distance from exon.

    Whichever of the left/right distances (from
    ``TranscriptProviderUtils.determine_closest_distance_from_exon``) has the smaller
    absolute value is negated; the left result is capped at -1 and the right result is
    floored at 1. Equal absolute values give 0. For a "-" strand transcript the sign of
    the result is inverted.
    """
    diffs = TranscriptProviderUtils.determine_closest_distance_from_exon(
        start, end, exon_i, t)
    left_diff, right_diff = diffs

    dist_from_exon = 0
    if abs(left_diff) < abs(right_diff):
        # Closer on the left: result is at most -1.
        dist_from_exon = min(left_diff * -1, -1)
    elif abs(right_diff) < abs(left_diff):
        # Closer on the right: result is at least 1.
        dist_from_exon = max(right_diff * -1, 1)

    if t.get_strand() == "-":
        dist_from_exon = dist_from_exon * -1
    return dist_from_exon
def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
    """Test genomic --> exon transform on real data.

    :param loc: pair (start, end) in genomic space handed to the conversion
    :param gt_d: expected (ground truth) start position in exon space
    """
    gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
    gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
    base_output_filename = "out/test_variant_classification"

    # Remove index directories left over from a previous run before rebuilding them.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
    ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")

    tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")
    start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])

    # The converted interval must keep the length of the genomic interval.
    loc_length = (int(loc[1]) - int(loc[0]))
    self.assertTrue((end - start) == loc_length,
                    str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))

    # Report the expected value (gt_d) in the failure message; the message used to print ``end``.
    self.assertTrue(start == gt_d,
                    "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")"
                    + " exons: " + str(tx[0].get_exons()))
def test_querying_transcripts_by_region(self):
    """Test web api backend call /transcripts/....

    Retrieves the transcripts in a region of chromosome 4, then walks over each one
    reading the fields a transcript JSON record would need and checks that the
    refseq cross-reference annotations exist.
    """
    datasources = DatasourceFactory.createDatasources(
        self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for datasource in datasources:
        annotator.addDatasource(datasource)

    txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
    self.assertTranscriptsFound(txs)

    ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
    # None of these values are validated.
    for tx in txs:
        transcript_id = tx.get_transcript_id()
        tx_start = tx.determine_transcript_start()
        tx_end = tx.determine_transcript_stop()
        gene = tx.get_gene()
        chr = tx.get_contig()
        n_exons = len(tx.get_exons())
        strand = tx.get_strand()
        footprint_start, footprint_end = tx.determine_cds_footprint()
        klass = tx.get_gene_type()
        cds_start = tx.determine_cds_start()
        cds_end = tx.determine_cds_stop()
        id = tx.get_gene_id()

        genomic_coords = []
        for exon in tx.get_exons():
            genomic_coords.append([exon[0], exon[1]])

        transcript_coords = []
        for exon in tx.get_exons():
            exon_space = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                exon[0] + 1, exon[1], tx)
            transcript_coords.append([exon_space])

        code_len = int(cds_end) - int(cds_start) + 1

        # If refseq datasources are not available, this will fail.
        # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
        dummy_mut = annotator.annotate_transcript(tx)
        refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
        refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

        # Description is unavailable right now
        description = ""

        self.assertTrue(refseq_mRNA_id is not None)
        self.assertTrue(refseq_prot_id is not None)
        self.assertTrue(len(transcript_coords) == n_exons)
def test_querying_transcripts_by_region(self):
    """Test web api backend call /transcripts/....

    Loads the hg19 datasources into an Annotator, queries a region on chromosome 4
    and, per transcript, collects the values needed for a transcript JSON record
    before asserting that the refseq cross-references are present.
    """
    db_dir = self._determine_db_dir()
    datasource_list = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)

    annotator = Annotator()
    for ds in datasource_list:
        annotator.addDatasource(ds)

    txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
    self.assertTranscriptsFound(txs)

    ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
    # None of these values are validated.
    for tx in txs:
        transcript_id = tx.get_transcript_id()
        tx_start = tx.determine_transcript_start()
        tx_end = tx.determine_transcript_stop()
        gene = tx.get_gene()
        chr = tx.get_contig()
        n_exons = len(tx.get_exons())
        strand = tx.get_strand()
        footprint_start, footprint_end = tx.determine_cds_footprint()
        klass = tx.get_gene_type()
        cds_start = tx.determine_cds_start()
        cds_end = tx.determine_cds_stop()
        id = tx.get_gene_id()
        genomic_coords = [[exon_start, exon_end] for exon_start, exon_end in
                          ((exon[0], exon[1]) for exon in tx.get_exons())]

        # One single-element list per exon, holding its exon-space coordinates.
        transcript_coords = []
        for exon in tx.get_exons():
            transcript_coords.append(
                [TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)])

        code_len = int(cds_end) - int(cds_start) + 1

        # If refseq datasources are not available, this will fail.
        # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
        dummy_mut = annotator.annotate_transcript(tx)
        refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
        refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

        # Description is unavailable right now
        description = ""

        self.assertTrue(refseq_mRNA_id is not None)
        self.assertTrue(refseq_prot_id is not None)
        self.assertTrue(len(transcript_coords) == n_exons)
def annotate_mutation(self, mutation, upstream_padding=3000, downstream_padding=0):
    """Annotate one mutation by running it through the GAF annotation steps.

    A ``variant_type`` annotation inferred from the ref/alt alleles is attached
    first. The mutation is then wrapped in a one-element list and passed through
    each ``gaf_annotation`` stage followed by ``self._annotateMutationFromTranscripts``.

    :param mutation: mutation to annotate; modified via ``createAnnotation``
    :param upstream_padding: accepted for interface compatibility; not read in this method
    :param downstream_padding: accepted for interface compatibility; not read in this method
    :return: the first item produced by the annotation pipeline
    """
    mutation.createAnnotation(
        'variant_type',
        TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele),
        self.title)

    data = [mutation]
    data = gaf_annotation.find_mut_in_gaf(data, self)
    data = gaf_annotation.identify_best_effect_transcript(data, self)
    data = gaf_annotation.identify_best_canonical_transcript(data, self)
    data = gaf_annotation.correct_transcript_coordinates(data, self)
    data = gaf_annotation.infer_output_fields(data, self)
    data = self._annotateMutationFromTranscripts(data)

    # The builtin next() replaces the Python 2-only ``.next()`` method and works on
    # both Python 2.6+ and Python 3 iterators.
    annotated_mutation = next(data)
    return annotated_mutation
def _get_overlapping_transcript_records(self, records, start, end):
    """Keep the records whose ``get_start()``/``get_end()`` span overlaps ``start``..``end``.

    :param records: iterable of objects exposing ``get_start()`` and ``get_end()``
    :param start: query start (converted with int() for each comparison)
    :param end: query end (converted with int() for each comparison)
    :return: list of the overlapping records, in input order
    """
    overlapping = []
    for record in records:
        hits = TranscriptProviderUtils.test_overlap(
            int(start), int(end), record.get_start(), record.get_end())
        if hits:
            overlapping.append(record)
    return overlapping
def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
    """Perform classifications.

    Everything handled in genomic space

    *RNA*
    x'UTR
    Splice_Site (Intron)
    Intron
    Splice_Site (Exon)
    {Missense, Silent}
    {Nonsense, Silent}
    {Nonstop, Silent}
    IGR
    x'Flank
    De_novo_Start

    :param tx: transcript to classify the variant against
    :param ref_allele: reference allele; "-" is treated as the empty string
    :param alt_allele: alternate allele; "-" is treated as the empty string
    :param start: variant start (converted with int() where an integer is needed)
    :param end: variant end (converted with int() where an integer is needed)
    :param variant_type: variant type string
    :param dist: passed through to ``_determine_if_splice_site_overlap``
    :return: a VariantClassification instance
    :raises ValueError: if none of the classification branches applies
    """
    gene_type = tx.get_gene_type()
    if gene_type != "protein_coding":
        # Non-coding genes are classified from the gene type alone.
        if gene_type == VariantClassification.LINCRNA:
            return VariantClassification(VariantClassification.LINCRNA, variant_type, tx.get_transcript_id())
        else:
            return VariantClassification(VariantClassification.RNA, variant_type, tx.get_transcript_id())

    if ref_allele == "-":
        ref_allele = ""
    if alt_allele == "-":
        alt_allele = ""

    s = int(start)
    e = int(end)
    is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(s, e, tx, variant_type)

    is_splice_site_tuple = self._determine_if_splice_site_overlap(s, e, tx, variant_type, dist)
    is_splice_site = is_splice_site_tuple[0]

    is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(start, end, tx, variant_type)

    if not is_exon_overlap and not is_beyond_exons:
        exon_i = TranscriptProviderUtils.determine_closest_exon(tx, int(start), int(end))
        if is_splice_site:
            # Intron Splice Site
            return VariantClassification(VariantClassification.SPLICE_SITE, variant_type, tx.get_transcript_id(),
                                         vc_secondary=VariantClassification.INTRON, exon_i=exon_i)
        else:
            return VariantClassification(VariantClassification.INTRON, variant_type, tx.get_transcript_id(),
                                         exon_i=exon_i)

    if not is_exon_overlap and is_beyond_exons:
        if is_flank:
            # Flanks
            if side.startswith("3"):
                return VariantClassification(VariantClassification.THREE_PRIME_PRIME_FLANK, variant_type,
                                             transcript_id=tx.get_transcript_id())
            else:
                return VariantClassification(VariantClassification.FIVE_PRIME_PRIME_FLANK, variant_type,
                                             transcript_id=tx.get_transcript_id())
        else:
            # IGR
            return VariantClassification(VariantClassification.IGR, variant_type)

    is_start_codon_overlap = self._determine_codon_overlap(s, e, tx.get_start_codon(), variant_type)
    is_stop_codon_overlap = self._determine_codon_overlap(s, e, tx.get_stop_codon(), variant_type)

    # Variant types ending in "NP" fall through to the CDS handling further down.
    if is_start_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Start_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())
    if is_stop_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Stop_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())

    is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
    if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
        # UTR
        if side.startswith("3"):
            vc_tmp = VariantClassification.THREE_PRIME_UTR
        else:
            vc_tmp = VariantClassification.FIVE_PRIME_UTR
        transcript_position_exon_space_start, transcript_position_exon_space_end = \
            TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
        vc = self._determine_de_novo(vc_tmp, transcript_position_exon_space_start, ref_allele, alt_allele, tx,
                                     variant_type)
        return VariantClassification(vc, variant_type, transcript_id=tx.get_transcript_id(), )

    # We have a clean overlap in the CDS. Includes start codon or stop codon.
    if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
        is_frameshift_indel = self.is_frameshift_indel(variant_type, int(start), int(end), alt_allele)
        return self._determine_vc_for_cds_overlap(start, end, ref_allele, alt_allele, is_frameshift_indel,
                                                  is_splice_site, tx, variant_type, is_start_codon_overlap)

    # The accessor name was misspelled here (get_trancript_id), which raised
    # AttributeError instead of the intended ValueError.
    raise ValueError("Could not determine variant classification: " + tx.get_transcript_id() + " " +
                     str([ref_allele, alt_allele, start, end]))
def test_determine_closest_distance_from_exon_in_exon(self):
    """Check the sign of both distances for an interval queried against exon 1 of the MAPK1 test transcript."""
    tx = self.retrieve_test_transcript_MAPK1()

    # Right in exon 1
    distances = TranscriptProviderUtils.determine_closest_distance_from_exon(22162000, 22162005, 1, tx)
    left_diff, right_diff = distances

    signs_as_expected = left_diff < 0 and right_diff > 0
    self.assertTrue(signs_as_expected,
                    "left distance should be negative while right distance should be positive.")
def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
    """Perform classifications.

    Everything handled in genomic space

    *RNA*
    x'UTR
    Splice_Site (Intron)
    Intron
    Splice_Site (Exon)
    {Missense, Silent}
    {Nonsense, Silent}
    {Nonstop, Silent}
    IGR
    x'Flank
    De_novo_Start

    :param tx: transcript to classify the variant against
    :param ref_allele: reference allele; "-" is treated as the empty string
    :param alt_allele: alternate allele; "-" is treated as the empty string
    :param start: variant start (converted with int() where an integer is needed)
    :param end: variant end (converted with int() where an integer is needed)
    :param variant_type: variant type string
    :param dist: passed through to ``_determine_if_splice_site_overlap``
    :return: a VariantClassification instance
    :raises ValueError: if none of the classification branches applies
    """
    gene_type = tx.get_gene_type()
    if gene_type != "protein_coding":
        # Non-coding genes are classified from the gene type alone.
        if gene_type == VariantClassification.LINCRNA:
            return VariantClassification(VariantClassification.LINCRNA,
                                         variant_type,
                                         tx.get_transcript_id())
        else:
            return VariantClassification(VariantClassification.RNA,
                                         variant_type,
                                         tx.get_transcript_id())

    if ref_allele == "-":
        ref_allele = ""
    if alt_allele == "-":
        alt_allele = ""

    s = int(start)
    e = int(end)
    is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(
        s, e, tx, variant_type)

    is_splice_site_tuple = self._determine_if_splice_site_overlap(
        s, e, tx, variant_type, dist)
    is_splice_site = is_splice_site_tuple[0]

    is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(
        start, end, tx, variant_type)

    if not is_exon_overlap and not is_beyond_exons:
        exon_i = TranscriptProviderUtils.determine_closest_exon(
            tx, int(start), int(end))
        if is_splice_site:
            # Intron Splice Site
            return VariantClassification(
                VariantClassification.SPLICE_SITE,
                variant_type,
                tx.get_transcript_id(),
                vc_secondary=VariantClassification.INTRON,
                exon_i=exon_i)
        else:
            return VariantClassification(VariantClassification.INTRON,
                                         variant_type,
                                         tx.get_transcript_id(),
                                         exon_i=exon_i)

    if not is_exon_overlap and is_beyond_exons:
        if is_flank:
            # Flanks
            if side.startswith("3"):
                return VariantClassification(
                    VariantClassification.THREE_PRIME_PRIME_FLANK,
                    variant_type,
                    transcript_id=tx.get_transcript_id())
            else:
                return VariantClassification(
                    VariantClassification.FIVE_PRIME_PRIME_FLANK,
                    variant_type,
                    transcript_id=tx.get_transcript_id())
        else:
            # IGR
            return VariantClassification(VariantClassification.IGR,
                                         variant_type)

    is_start_codon_overlap = self._determine_codon_overlap(
        s, e, tx.get_start_codon(), variant_type)
    is_stop_codon_overlap = self._determine_codon_overlap(
        s, e, tx.get_stop_codon(), variant_type)

    # Variant types ending in "NP" fall through to the CDS handling further down.
    if is_start_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Start_Codon_' + variant_type.capitalize(),
                                     variant_type,
                                     transcript_id=tx.get_transcript_id())
    if is_stop_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Stop_Codon_' + variant_type.capitalize(),
                                     variant_type,
                                     transcript_id=tx.get_transcript_id())

    is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
    if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
        # UTR
        if side.startswith("3"):
            vc_tmp = VariantClassification.THREE_PRIME_UTR
        else:
            vc_tmp = VariantClassification.FIVE_PRIME_UTR
        transcript_position_exon_space_start, transcript_position_exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            start, end, tx)
        vc = self._determine_de_novo(vc_tmp,
                                     transcript_position_exon_space_start,
                                     ref_allele, alt_allele, tx, variant_type)
        return VariantClassification(
            vc,
            variant_type,
            transcript_id=tx.get_transcript_id(), )

    # We have a clean overlap in the CDS. Includes start codon or stop codon.
    if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
        is_frameshift_indel = self.is_frameshift_indel(
            variant_type, int(start), int(end), alt_allele)
        return self._determine_vc_for_cds_overlap(
            start, end, ref_allele, alt_allele, is_frameshift_indel,
            is_splice_site, tx, variant_type, is_start_codon_overlap)

    # The accessor name was misspelled here (get_trancript_id), which raised
    # AttributeError instead of the intended ValueError.
    raise ValueError("Could not determine variant classification: " +
                     tx.get_transcript_id() + " " +
                     str([ref_allele, alt_allele, start, end]))
def test_render_protein_change(self, variant_type, variant_classification, secondary_vc, prot_position_start, prot_position_end, ref_prot_allele, alt_prot_allele, strand, gt):
    """Simple test of protein change, once parameters have been rendered.

    Renders the protein change from the supplied pieces and compares it with the
    expected string ``gt``. ``strand`` is accepted but not used by the check.
    """
    rendered = TranscriptProviderUtils.render_protein_change(
        variant_type, variant_classification,
        prot_position_start, prot_position_end,
        ref_prot_allele, alt_prot_allele, secondary_vc)
    failure_message = "Incorrect guess gt <> guess: %s <> %s" % (gt, rendered)
    self.assertTrue(rendered == gt, failure_message)
def test_render_transcript_change(self, variant_type, vc, exon_position_start, exon_position_end, ref_allele_stranded, alt_allele_stranded, gt, secondary_vc):
    """Simple test of transcript change, once parameters have been rendered.

    Renders the transcript change from the supplied pieces and compares it with
    the expected string ``gt``.
    """
    rendered = TranscriptProviderUtils.render_transcript_change(
        variant_type, vc,
        exon_position_start, exon_position_end,
        ref_allele_stranded, alt_allele_stranded, secondary_vc)
    failure_message = "Incorrect guess gt <> guess: %s <> %s" % (gt, rendered)
    self.assertTrue(rendered == gt, failure_message)
def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
    """Build the VariantClassification for a variant that overlaps the CDS.

    Note: This method can also handle start and stop codons.

    The variant is moved into exon (transcript) space, the affected reference
    codon(s) are extracted and mutated, both are translated, and the resulting
    amino acids are handed to ``infer_variant_classification``.

    :param start: variant start, handed to the genomic --> exon space conversion
    :param end: variant end, handed to the genomic --> exon space conversion
    :param ref_allele: reference allele, not yet stranded
    :param alt_allele: alternate allele, not yet stranded
    :param is_frameshift_indel: forwarded to ``infer_variant_classification``
    :param is_splice_site: forwarded to ``infer_variant_classification``
    :param tx: transcript
    :param variant_type: variant type string
    :param is_start_codon: forwarded to ``infer_variant_classification``
    :return: a VariantClassification populated with codon, amino acid and position details
    """
    observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(
        ref_allele, alt_allele, tx)
    transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        start, end, tx)

    # NOTE(review): presumably an index-base adjustment for "+" strand non-insertions -- confirm.
    if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
        transcript_position_start -= 1
        transcript_position_end -= 1

    transcript_seq = tx.get_seq()
    protein_seq = tx.get_protein_seq()
    cds_start, cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(
        tx)
    protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
        transcript_position_start, transcript_position_end, cds_start)

    # If the transcript sequence disagrees with the stranded reference allele (and this is
    # not an insertion), overwrite that stretch with the reference allele and remember that
    # the transcript sequence was altered.
    new_ref_transcript_seq = transcript_seq
    if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded
            ) and variant_type != VariantClassification.VT_INS:
        new_ref_transcript_seq = list(transcript_seq)
        new_ref_transcript_seq[
            transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
        new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
        ref_tx_seq_has_been_changed = True
    else:
        ref_tx_seq_has_been_changed = False

    cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
        protein_position_start, protein_position_end, cds_start)

    # Reference codon(s): for a deletion the lower-cased slice is used directly; otherwise
    # the slice is passed through mutate_reference_sequence with the reference allele.
    if variant_type == "DEL":
        reference_codon_seq = new_ref_transcript_seq[
            cds_codon_start:cds_codon_end + 1].lower()
    else:
        reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(),
            cds_codon_start, transcript_position_start,
            transcript_position_end, reference_allele_stranded, variant_type)

    # Mutated codon(s): insertions on the "-" strand use a codon start shifted by one.
    if variant_type == "INS" and tx.get_strand() == "-":
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start - 1,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)
    else:
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)

    observed_aa = MutUtils.translate_sequence(mutated_codon_seq)

    # When the transcript sequence was overwritten above, the stored protein sequence is not
    # used; the reference amino acids are translated from the reference codon(s) instead.
    if ref_tx_seq_has_been_changed:
        reference_aa = MutUtils.translate_sequence(reference_codon_seq)
    else:
        reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

    if variant_type != VariantClassification.VT_SNP:
        try:
            reference_aa, observed_aa, protein_position_start, protein_position_end = \
                self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                                                          protein_position_end, reference_aa, observed_aa)
        except InvalidVariantException as ive:
            # Best effort: log the problem and continue with the unadjusted values.
            logging.getLogger(__name__).error(
                "Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s"
                % (tx.get_contig(), start, end, ref_allele, alt_allele, variant_type))
            logging.getLogger(__name__).error(str(ive))
            logging.getLogger(__name__).warn(
                "Above error may not have exact start and end positions if this is a VCF input."
            )
            logging.getLogger(__name__).warn(
                "Variant type is likely incorrect. This can happen with some GATK VCFs"
            )
            logging.getLogger(__name__).warn(
                TranscriptProviderUtils.is_valid_xNP(
                    variant_type, ref_allele, alt_allele))
            logging.getLogger(__name__).warn(
                "The protein_change annotation may not be properly rendered."
            )

    vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
        variant_type,
        reference_aa,
        observed_aa,
        ref_allele,
        alt_allele,
        is_frameshift_indel=is_frameshift_indel,
        is_splice_site=is_splice_site,
        is_start_codon=is_start_codon)

    cds_start_exon_space, cds_end_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(
        tx)
    exon_i = TranscriptProviderUtils.determine_exon_index(
        int(start), int(end), tx, variant_type)

    # The alt and ref codon positions are both filled from the same codon start/end.
    final_vc = VariantClassification(
        vc_tmp,
        variant_type,
        transcript_id=tx.get_transcript_id(),
        alt_codon=mutated_codon_seq,
        ref_codon=reference_codon_seq,
        ref_aa=reference_aa,
        ref_protein_start=protein_position_start,
        ref_protein_end=protein_position_end,
        alt_aa=observed_aa,
        alt_codon_start_in_exon=cds_codon_start,
        alt_codon_end_in_exon=cds_codon_end,
        ref_codon_start_in_exon=cds_codon_start,
        ref_codon_end_in_exon=cds_codon_end,
        cds_start_in_exon_space=cds_start_exon_space,
        ref_allele_stranded=reference_allele_stranded,
        alt_allele_stranded=observed_allele_stranded,
        exon_i=exon_i,
        vc_secondary=vc_tmp_secondary)
    return final_vc
def test_transform_to_feature_space(self, exons, s, gt, strand):
    """Run some basic tests transforming genomic coordinates to exon coordinates, taking strand into account.

    :param exons: exons handed to the transform
    :param s: genomic position handed to the transform
    :param gt: expected transformed value
    :param strand: strand handed to the transform
    """
    transformed = TranscriptProviderUtils._transform_to_feature_space(exons, s, strand)
    failure_message = "".join([
        "Did not transform genomic to exon space properly: ", str(exons),
        " pos: ", str(s),
        " strand: ", strand,
        " guess/gt: ", str(transformed), "/", str(gt),
    ])
    self.assertTrue(transformed == gt, failure_message)