def _determine_de_novo(self, vc_str, exon_start, ref, alt, tx, variant_type, buffer=2):
    """Reclassify a 5'UTR variant as a de novo start when it creates an ATG.

    The incoming classification is handed back untouched unless it equals
    ``VariantClassification.FIVE_PRIME_UTR`` and ``ref`` differs from ``alt``.
    In that case the sequence returned by ``self._mutate_exon`` is searched
    for the first ``'ATG'``; when one is found the result becomes
    ``'De_novo_Start_InFrame'`` or ``'De_novo_Start_OutOfFrame'``.

    :param vc_str: Current variant classification.  Anything other than 5'UTR is returned as is.
    :param exon_start: Offset that, together with ``buffer``, maps the ATG hit back to exon space.
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: Forwarded to ``self._mutate_exon``.
    :param buffer: Forwarded to ``self._mutate_exon`` and subtracted when locating the ATG.
    :return: ``vc_str`` or one of the two ``De_novo_Start_*`` strings.
    """
    is_candidate = vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt
    if not is_candidate:
        return vc_str

    mutated_region = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
    hit = mutated_region.find('ATG')
    if hit <= -1:
        # No start codon was introduced; keep the original classification.
        return vc_str

    atg_in_exon_space = exon_start + hit - buffer
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)

    # The new ATG is in frame when its distance to the CDS start is a multiple of three.
    frame_label = 'InFrame' if (cds_start - atg_in_exon_space) % 3 == 0 else 'OutOfFrame'
    return 'De_novo_Start_' + frame_label
def _determine_de_novo(self, vc_str, exon_start, ref, alt, tx, variant_type, buffer=2):
    """Return ``vc_str``, or a de novo start classification for 5'UTR variants.

    Only a classification equal to ``VariantClassification.FIVE_PRIME_UTR``
    with ``ref != alt`` is examined.  The mutated sequence comes from
    ``self._mutate_exon``; if it contains ``'ATG'`` the returned value is
    ``'De_novo_Start_'`` followed by ``'InFrame'`` or ``'OutOfFrame'``.

    :param vc_str: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
    :param exon_start: Added to the ATG offset (minus ``buffer``) to obtain the ATG position in exon space.
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: Passed through to ``self._mutate_exon``.
    :param buffer: Passed through to ``self._mutate_exon``; also removed from the ATG offset.
    :return: The original or the updated variant classification.
    """
    if not (vc_str == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc_str

    utr_with_variant = self._mutate_exon(tx, ref, alt, variant_type, exon_start, buffer)
    atg_offset = utr_with_variant.find('ATG')
    if not atg_offset > -1:
        return vc_str

    atg_exon_pos = exon_start + atg_offset - buffer
    cds_bounds = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    cds_start_pos, _ = cds_bounds

    # Frame is decided by the distance between the CDS start and the new ATG.
    distance_to_cds = cds_start_pos - atg_exon_pos
    if distance_to_cds % 3 == 0:
        suffix = 'InFrame'
    else:
        suffix = 'OutOfFrame'
    return 'De_novo_Start_' + suffix
def test_codon_single_base(self, start, end, ref_base_stranded, gt_codon):
    """Check that the codon containing a single genomic base is extracted correctly.

    The genomic interval is converted to exon space on the MAPK1 test
    transcript, mapped to protein positions and then to codon boundaries;
    the transcript slice between those boundaries must equal ``gt_codon``.

    :param start: Genomic start of the base under test.
    :param end: Genomic end of the base under test.
    :param ref_base_stranded: Accepted for the test data signature; not used in the body.
    :param gt_codon: Expected (ground truth) codon sequence.
    """
    tx = self.retrieve_test_transcript_MAPK1()

    tx_start, tx_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    aa_start, aa_end = TranscriptProviderUtils.get_protein_positions(tx_start, tx_end, cds_start)
    codon_first, codon_last = TranscriptProviderUtils.get_cds_codon_positions(aa_start, aa_end, cds_start)

    # codon_last is inclusive, hence the +1 on the slice end.
    observed_codon = tx.get_seq()[codon_first:codon_last + 1]

    failure_msg = "Did not get correct codon (%s): %s loc: %s-%s" % (gt_codon, observed_codon, start, end)
    self.assertTrue(observed_codon == gt_codon, failure_msg)
def _determine_de_novo_old(self, vc, transcript_position_start, transcript_position_end, ref, alt, tx, variant_type):
    """Earlier de novo start detection working on transcript positions.

    ``vc`` is returned unchanged unless it equals
    ``VariantClassification.FIVE_PRIME_UTR`` and ``ref != alt``.  Otherwise a
    window of two bases either side of the variant is cut from the transcript
    sequence, mutated with the stranded alt allele, and searched for
    ``'ATG'``.  A hit yields ``'De_novo_Start_InFrame'`` or
    ``'De_novo_Start_OutOfFrame'``.

    :param vc: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
    :param transcript_position_start: Variant start in transcript space.
    :param transcript_position_end: Variant end in transcript space.
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: Variant type; insertions collapse start/end onto one position.
    :return: ``vc`` or one of the two ``De_novo_Start_*`` strings.
    """
    if not (vc == VariantClassification.FIVE_PRIME_UTR and ref != alt):
        return vc

    alt_stranded = self._determine_stranded_allele(alt, tx.get_strand())
    # The stranded reference allele is not used below; the call is retained
    # so the sequence of helper calls is unchanged.
    self._determine_stranded_allele(ref, tx.get_strand())
    transcript_seq = tx.get_seq()

    if variant_type == VariantClassification.VT_INS:
        # Insertions are anchored on a single position, chosen by strand.
        if tx.get_strand() == "-":
            transcript_position_start = transcript_position_end
        else:
            transcript_position_end = transcript_position_start

    window_start = transcript_position_start - 2
    window_end = transcript_position_end + 2

    # TODO: This may not work for "+" strand. Need unit test.
    window_seq = transcript_seq[window_start:window_end + 1]
    mutated_window = TranscriptProviderUtils.mutate_reference_sequence(
        window_seq,
        window_start,
        transcript_position_start,
        transcript_position_end,
        alt_stranded,
        variant_type,
    )

    # Check for Denovo
    hit = mutated_window.find('ATG')
    if hit <= -1:
        return vc

    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    atg_position = window_start + hit + 1
    frame_label = 'InFrame' if (cds_start - atg_position) % 3 == 0 else 'OutOfFrame'
    return 'De_novo_Start_' + frame_label
def _determine_de_novo_old(self, vc, transcript_position_start, transcript_position_end, ref, alt, tx, variant_type):
    """Return ``vc``, or a de novo start classification for a 5'UTR variant.

    Nothing is computed unless ``vc`` equals
    ``VariantClassification.FIVE_PRIME_UTR`` and the alleles differ.  The
    transcript sequence around the variant (two bases of padding on each
    side) is mutated with the stranded alt allele; if the mutated sequence
    contains ``'ATG'`` the result is ``'De_novo_Start_'`` plus ``'InFrame'``
    or ``'OutOfFrame'``.

    :param vc: Current variant classification.  Note that if this is not 5'UTR, this method will just return this input.
    :param transcript_position_start: Variant start in transcript space.
    :param transcript_position_end: Variant end in transcript space.
    :param ref: (str) Does not take into account strandedness (e.g. m.ref_allele)
    :param alt: (str) Does not take into account strandedness (e.g. m.alt_allele)
    :param tx: transcript
    :param variant_type: Variant type; for insertions start and end are made equal, by strand.
    :return: The original or the updated variant classification.
    """
    outcome = vc
    is_utr_change = vc == VariantClassification.FIVE_PRIME_UTR and ref != alt
    if not is_utr_change:
        return outcome

    stranded_alt = self._determine_stranded_allele(alt, tx.get_strand())
    # Stranded ref allele is computed as in the original flow but is not consumed afterwards.
    stranded_ref = self._determine_stranded_allele(ref, tx.get_strand())
    full_seq = tx.get_seq()

    is_insertion = variant_type == VariantClassification.VT_INS
    if is_insertion and tx.get_strand() == "-":
        transcript_position_start = transcript_position_end
    elif is_insertion:
        transcript_position_end = transcript_position_start

    padded_start = transcript_position_start - 2
    padded_end = transcript_position_end + 2

    # TODO: This may not work for "+" strand. Need unit test.
    padded_seq = full_seq[padded_start:padded_end + 1]
    mutated_seq = TranscriptProviderUtils.mutate_reference_sequence(
        padded_seq, padded_start, transcript_position_start,
        transcript_position_end, stranded_alt, variant_type)

    # Check for Denovo
    atg_offset = mutated_seq.find('ATG')
    if atg_offset > -1:
        cds_bounds = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        cds_start_pos, _ = cds_bounds
        atg_offset = padded_start + atg_offset + 1
        if (cds_start_pos - atg_offset) % 3 == 0:
            outcome = 'De_novo_Start_' + 'InFrame'
        else:
            outcome = 'De_novo_Start_' + 'OutOfFrame'
    return outcome
def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
    """Build the VariantClassification for a variant overlapping the CDS.

    Note: This method can also handle start and stop codons.

    The genomic interval is converted to exon space, the affected codon(s)
    are located, reference and mutated codon sequences are produced with
    ``TranscriptProviderUtils.mutate_reference_sequence`` and translated,
    and the result is passed to ``self.infer_variant_classification``.

    If the transcript sequence at the variant position disagrees with the
    stranded reference allele (non-insertions only), the reference allele is
    written into a copy of the transcript sequence and the reference amino
    acids are translated from that copy instead of being sliced from the
    transcript's protein sequence.

    For non-SNP variants the protein positions and amino acids are adjusted
    with ``self._adjust_protein_position_and_alleles``.  An
    ``InvalidVariantException`` from that step is logged and not re-raised;
    the unadjusted values are used in that case.

    :param start: Genomic start position.
    :param end: Genomic end position.
    :param ref_allele: Reference allele (unstranded).
    :param alt_allele: Alternate allele (unstranded).
    :param is_frameshift_indel: Forwarded to ``infer_variant_classification``.
    :param is_splice_site: Forwarded to ``infer_variant_classification``.
    :param tx: transcript
    :param variant_type: Variant type (compared against "DEL", "INS" and the VariantClassification.VT_* constants).
    :param is_start_codon: Forwarded to ``infer_variant_classification``.
    :return: A populated ``VariantClassification`` instance.
    """
    observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        start, end, tx)

    # Non-insertions on the "+" strand are shifted back by one position.
    if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
        transcript_position_start -= 1
        transcript_position_end -= 1

    transcript_seq = tx.get_seq()
    protein_seq = tx.get_protein_seq()
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
        transcript_position_start, transcript_position_end, cds_start)

    # When the transcript disagrees with the supplied reference allele, trust
    # the allele: splice it into a copy of the transcript sequence.
    new_ref_transcript_seq = transcript_seq
    if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded) \
            and variant_type != VariantClassification.VT_INS:
        new_ref_transcript_seq = list(transcript_seq)
        new_ref_transcript_seq[transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
        new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
        ref_tx_seq_has_been_changed = True
    else:
        ref_tx_seq_has_been_changed = False

    cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
        protein_position_start, protein_position_end, cds_start)

    if variant_type == "DEL":
        reference_codon_seq = new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower()
    else:
        reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(),
            cds_codon_start, transcript_position_start, transcript_position_end,
            reference_allele_stranded, variant_type)

    # Insertions on the "-" strand use a codon start shifted by one.
    if variant_type == "INS" and tx.get_strand() == "-":
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start - 1,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)
    else:
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)

    observed_aa = Bio.Seq.translate(mutated_codon_seq)
    if ref_tx_seq_has_been_changed:
        reference_aa = Bio.Seq.translate(reference_codon_seq)
    else:
        reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

    if variant_type != VariantClassification.VT_SNP:
        try:
            reference_aa, observed_aa, protein_position_start, protein_position_end = \
                self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                                                          protein_position_end, reference_aa, observed_aa)
        except InvalidVariantException as ive:
            # Best effort: report the problem and continue with the
            # unadjusted protein positions and amino acids.
            logger = logging.getLogger(__name__)
            logger.error("Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                         tx.get_contig(), start, end, ref_allele, alt_allele, variant_type)
            logger.error(str(ive))
            # Logger.warning replaces the deprecated Logger.warn alias.
            logger.warning("Above error may not have exact start and end positions if this is a VCF input.")
            logger.warning("Variant type is likely incorrect. This can happen with some GATK VCFs")
            logger.warning(TranscriptProviderUtils.is_valid_xNP(variant_type, ref_allele, alt_allele))
            logger.warning("The protein_change annotation may not be properly rendered.")

    vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
        variant_type, reference_aa, observed_aa, ref_allele, alt_allele,
        is_frameshift_indel=is_frameshift_indel,
        is_splice_site=is_splice_site,
        is_start_codon=is_start_codon)

    cds_start_exon_space, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    exon_i = TranscriptProviderUtils.determine_exon_index(int(start), int(end), tx, variant_type)

    final_vc = VariantClassification(
        vc_tmp, variant_type,
        transcript_id=tx.get_transcript_id(),
        alt_codon=mutated_codon_seq,
        ref_codon=reference_codon_seq,
        ref_aa=reference_aa,
        ref_protein_start=protein_position_start,
        ref_protein_end=protein_position_end,
        alt_aa=observed_aa,
        alt_codon_start_in_exon=cds_codon_start,
        alt_codon_end_in_exon=cds_codon_end,
        ref_codon_start_in_exon=cds_codon_start,
        ref_codon_end_in_exon=cds_codon_end,
        cds_start_in_exon_space=cds_start_exon_space,
        ref_allele_stranded=reference_allele_stranded,
        alt_allele_stranded=observed_allele_stranded,
        exon_i=exon_i,
        vc_secondary=vc_tmp_secondary)
    return final_vc
def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
    """Build the VariantClassification for a variant overlapping the CDS.

    Note: This method can also handle start and stop codons.

    The genomic interval is converted to exon space, the affected codon(s)
    are located, reference and mutated codon sequences are produced with
    ``TranscriptProviderUtils.mutate_reference_sequence`` and translated
    with ``MutUtils.translate_sequence``, and the result is passed to
    ``self.infer_variant_classification``.

    If the transcript sequence at the variant position disagrees with the
    stranded reference allele (non-insertions only), the reference allele is
    written into a copy of the transcript sequence and the reference amino
    acids are translated from that copy instead of being sliced from the
    transcript's protein sequence.

    For non-SNP variants the protein positions and amino acids are adjusted
    with ``self._adjust_protein_position_and_alleles``.  An
    ``InvalidVariantException`` from that step is logged and not re-raised;
    the unadjusted values are used in that case.

    :param start: Genomic start position.
    :param end: Genomic end position.
    :param ref_allele: Reference allele (unstranded).
    :param alt_allele: Alternate allele (unstranded).
    :param is_frameshift_indel: Forwarded to ``infer_variant_classification``.
    :param is_splice_site: Forwarded to ``infer_variant_classification``.
    :param tx: transcript
    :param variant_type: Variant type (compared against "DEL", "INS" and the VariantClassification.VT_* constants).
    :param is_start_codon: Forwarded to ``infer_variant_classification``.
    :return: A populated ``VariantClassification`` instance.
    """
    observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(
        ref_allele, alt_allele, tx)
    transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        start, end, tx)

    # Non-insertions on the "+" strand are shifted back by one position.
    if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
        transcript_position_start -= 1
        transcript_position_end -= 1

    transcript_seq = tx.get_seq()
    protein_seq = tx.get_protein_seq()
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
        transcript_position_start, transcript_position_end, cds_start)

    # When the transcript disagrees with the supplied reference allele, trust
    # the allele: splice it into a copy of the transcript sequence.
    new_ref_transcript_seq = transcript_seq
    if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded) \
            and variant_type != VariantClassification.VT_INS:
        new_ref_transcript_seq = list(transcript_seq)
        new_ref_transcript_seq[transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
        new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
        ref_tx_seq_has_been_changed = True
    else:
        ref_tx_seq_has_been_changed = False

    cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
        protein_position_start, protein_position_end, cds_start)

    if variant_type == "DEL":
        reference_codon_seq = new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower()
    else:
        reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(),
            cds_codon_start, transcript_position_start, transcript_position_end,
            reference_allele_stranded, variant_type)

    # Insertions on the "-" strand use a codon start shifted by one.
    if variant_type == "INS" and tx.get_strand() == "-":
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start - 1,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)
    else:
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start,
            transcript_position_start, transcript_position_end,
            observed_allele_stranded, variant_type)

    observed_aa = MutUtils.translate_sequence(mutated_codon_seq)
    if ref_tx_seq_has_been_changed:
        reference_aa = MutUtils.translate_sequence(reference_codon_seq)
    else:
        reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

    if variant_type != VariantClassification.VT_SNP:
        try:
            reference_aa, observed_aa, protein_position_start, protein_position_end = \
                self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                                                          protein_position_end, reference_aa, observed_aa)
        except InvalidVariantException as ive:
            # Best effort: report the problem and continue with the
            # unadjusted protein positions and amino acids.
            logger = logging.getLogger(__name__)
            logger.error("Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                         tx.get_contig(), start, end, ref_allele, alt_allele, variant_type)
            logger.error(str(ive))
            # Logger.warning replaces the deprecated Logger.warn alias.
            logger.warning("Above error may not have exact start and end positions if this is a VCF input.")
            logger.warning("Variant type is likely incorrect. This can happen with some GATK VCFs")
            logger.warning(TranscriptProviderUtils.is_valid_xNP(variant_type, ref_allele, alt_allele))
            logger.warning("The protein_change annotation may not be properly rendered.")

    vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
        variant_type, reference_aa, observed_aa, ref_allele, alt_allele,
        is_frameshift_indel=is_frameshift_indel,
        is_splice_site=is_splice_site,
        is_start_codon=is_start_codon)

    cds_start_exon_space, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    exon_i = TranscriptProviderUtils.determine_exon_index(int(start), int(end), tx, variant_type)

    final_vc = VariantClassification(
        vc_tmp, variant_type,
        transcript_id=tx.get_transcript_id(),
        alt_codon=mutated_codon_seq,
        ref_codon=reference_codon_seq,
        ref_aa=reference_aa,
        ref_protein_start=protein_position_start,
        ref_protein_end=protein_position_end,
        alt_aa=observed_aa,
        alt_codon_start_in_exon=cds_codon_start,
        alt_codon_end_in_exon=cds_codon_end,
        ref_codon_start_in_exon=cds_codon_start,
        ref_codon_end_in_exon=cds_codon_end,
        cds_start_in_exon_space=cds_start_exon_space,
        ref_allele_stranded=reference_allele_stranded,
        alt_allele_stranded=observed_allele_stranded,
        exon_i=exon_i,
        vc_secondary=vc_tmp_secondary)
    return final_vc