Exemplo n.º 1
0
    def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
        """Build the transcript change string for a variant on the given transcript.

        :param tx: transcript; queried for its strand and used for the genomic to exon space conversion
        :param variant_type: variant type, handed to the renderer unchanged
        :param vc: variant classification supplying the primary/secondary classification and the
            CDS start in exon space
        :param start_genomic_space: variant start in genomic space (converted with ``int``)
        :param end_genomic_space: variant end in genomic space (converted with ``int``)
        :param ref_allele: reference allele, stranded via ``_get_stranded_alleles`` before rendering
        :param alt_allele: alternate allele, stranded the same way
        :return: rendered transcript change, or "" when the variant is an intronic splice site or the
            classification's CDS start in exon space is "" or negative
        """
        is_intronic_splice_site = (vc.get_vc() == VariantClassification.SPLICE_SITE
                                   and vc.get_secondary_vc() == VariantClassification.INTRON)
        if is_intronic_splice_site:
            # Splice-site specific rendering is disabled; these variants get no transcript change.
            return ""

        cds_start_in_exon_space = vc.get_cds_start_in_exon_space()
        if cds_start_in_exon_space == "" or cds_start_in_exon_space < 0:
            return ""

        exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            int(start_genomic_space), int(end_genomic_space), tx)

        # Negative-strand positions carry an extra +1 relative to the positive strand.
        strand_shift = 1 if tx.get_strand() == "-" else 0
        cds_offset = int(cds_start_in_exon_space)
        cds_space_start = exon_start - cds_offset + strand_shift
        cds_space_end = exon_end - cds_offset + strand_shift

        observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
        return TranscriptProviderUtils.render_transcript_change(
            variant_type, vc.get_vc(), cds_space_start, cds_space_end,
            reference_allele_stranded, observed_allele_stranded, vc.get_secondary_vc())
    def test_seq(self, start, end, gt):
        """Check the transcript sequence found for a genomic interval on the MAPK1 test transcript.

        :param start: interval start, passed to the genomic to exon space conversion
        :param end: interval end, passed to the same conversion
        :param gt: expected sequence for the interval
        """
        mapk1_tx = self.retrieve_test_transcript_MAPK1()
        exon_space_start, exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            start, end, mapk1_tx)

        # The slice includes the converted end position, hence the +1 on the upper bound.
        observed_seq = mapk1_tx.get_seq()[exon_space_start:exon_space_end + 1]
        failure_msg = "Incorrect seq found guess,gt (%s, %s)" % (observed_seq, gt)
        self.assertTrue(observed_seq == gt, failure_msg)
    def test_codon_single_base(self, start, end, ref_base_stranded, gt_codon):
        """Check that the codon bases covering a genomic location on the MAPK1 test transcript are found.

        :param start: location start, passed to the genomic to exon space conversion
        :param end: location end, passed to the same conversion
        :param ref_base_stranded: not used by this test
        :param gt_codon: expected codon sequence
        """
        mapk1_tx = self.retrieve_test_transcript_MAPK1()
        utils = TranscriptProviderUtils

        # genomic -> exon space -> protein positions -> codon boundaries in exon space
        exon_space_start, exon_space_end = utils.convert_genomic_space_to_exon_space(start, end, mapk1_tx)
        cds_start, _ = utils.determine_cds_in_exon_space(mapk1_tx)
        aa_start, aa_end = utils.get_protein_positions(exon_space_start, exon_space_end, cds_start)
        codon_first, codon_last = utils.get_cds_codon_positions(aa_start, aa_end, cds_start)

        observed_codon = mapk1_tx.get_seq()[codon_first:codon_last + 1]
        failure_msg = "Did not get correct codon (%s): %s    loc: %s-%s" % (gt_codon, observed_codon, start, end)
        self.assertTrue(observed_codon == gt_codon, failure_msg)
    def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
        """Test genomic --> exon transform on real data.

        Builds Ensembl transcript indices from the MAPK1 GENCODE v18 test files, fetches the
        transcripts overlapping 22:22108790 and converts ``loc`` against the first one.

        :param loc: (start, end) genomic interval to convert
        :param gt_d: expected exon space start position for ``loc``
        """
        gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
        gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
        base_output_filename = "out/test_variant_classification"

        # Remove any index directories already present before rebuilding them.
        for index_suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + index_suffix, ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
        ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")
        tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

        start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])
        loc_length = (int(loc[1]) - int(loc[0]))
        self.assertEqual(end - start, loc_length,
                         str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))
        # Report the expected value (gt_d) in the failure message; the message used to print `end` here.
        self.assertEqual(start, gt_d,
                         "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")" + "   exons: " + str(tx[0].get_exons()))
Exemplo n.º 5
0
    def test_querying_transcripts_by_region(self):
        """Exercise the backend of the web api call /transcripts/... for a region on contig 4.

        Also serves as an example of collecting the data needed to populate the json
        described in doc/transcript_json_commented.json.txt.
        """
        datasources = DatasourceFactory.createDatasources(
            self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        for tx in txs:
            # The values below are gathered for illustration only; none of them are validated.
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig = tx.get_contig()
            exon_count = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            gene_id = tx.get_gene_id()

            genomic_coords = []
            for exon in tx.get_exons():
                genomic_coords.append([exon[0], exon[1]])

            # One single-element list per exon, holding that exon's converted span.
            transcript_coords = []
            for exon in tx.get_exons():
                exon_space_span = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                    exon[0] + 1, exon[1], tx)
                transcript_coords.append([exon_space_span])

            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Annotating the transcript produces a dummy mutation carrying the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == exon_count)
Exemplo n.º 6
0
    def test_querying_transcripts_by_region(self):
        """Query transcripts by region (web api backend call /transcripts/...) and annotate each one.

        The per-transcript fields gathered in the loop show how to obtain the data for the
        json in doc/transcript_json_commented.json.txt.
        """
        ds_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for single_ds in ds_list:
            annotator.addDatasource(single_ds)
        region_txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(region_txs)

        to_exon_space = TranscriptProviderUtils.convert_genomic_space_to_exon_space
        for tx in region_txs:
            # Example values only -- nothing here is validated.
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig_name = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            ensembl_gene_id = tx.get_gene_id()
            genomic_coords = [[exon[0], exon[1]] for exon in tx.get_exons()]
            # Each entry is a one-element list wrapping the converted span of an exon.
            transcript_coords = [[to_exon_space(exon[0] + 1, exon[1], tx)] for exon in tx.get_exons()]
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Annotate the transcript to get a dummy mutation holding the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
Exemplo n.º 7
0
    def generate_transcript_change_from_tx(self, tx, variant_type, vc,
                                           start_genomic_space,
                                           end_genomic_space, ref_allele,
                                           alt_allele):
        """Render the transcript change annotation for a variant.

        :param tx: transcript used for the strand lookup and the genomic to exon space conversion
        :param variant_type: variant type, passed through to the renderer
        :param vc: variant classification (primary and secondary classification, CDS start in exon space)
        :param start_genomic_space: variant start in genomic space (converted with ``int``)
        :param end_genomic_space: variant end in genomic space (converted with ``int``)
        :param ref_allele: reference allele
        :param alt_allele: alternate allele
        :return: the rendered transcript change; "" for intronic splice sites and when the CDS start
            in exon space is "" or negative
        """
        if vc.get_vc() == VariantClassification.SPLICE_SITE:
            if vc.get_secondary_vc() == VariantClassification.INTRON:
                # No transcript change is rendered for intronic splice sites.
                return ""

        raw_cds_start = vc.get_cds_start_in_exon_space()
        if raw_cds_start == "":
            return ""
        if raw_cds_start < 0:
            return ""

        exon_space_positions = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            int(start_genomic_space), int(end_genomic_space), tx)
        exon_position_start, exon_position_end = exon_space_positions

        is_negative_strand = tx.get_strand() == "-"
        cds_start_exon_space = int(raw_cds_start)
        cds_start_pos = exon_position_start - cds_start_exon_space
        cds_end_pos = exon_position_end - cds_start_exon_space
        if is_negative_strand:
            # Negative-strand coordinates are bumped by one.
            cds_start_pos += 1
            cds_end_pos += 1

        stranded_alleles = self._get_stranded_alleles(ref_allele, alt_allele, tx)
        observed_allele_stranded, reference_allele_stranded = stranded_alleles
        return TranscriptProviderUtils.render_transcript_change(
            variant_type,
            vc.get_vc(),
            cds_start_pos,
            cds_end_pos,
            reference_allele_stranded,
            observed_allele_stranded,
            vc.get_secondary_vc())
Exemplo n.º 8
0
    def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
        """Perform classifications.

        Everything handled in genomic space

        *RNA*
        x'UTR
        Splice_Site (Intron)
        Intron
        Splice_Site (Exon)
        {Missense, Silent}
        {Nonsense, Silent}
        {Nonstop, Silent}
        IGR
        x'Flank
        De_novo_Start

        :param tx: transcript to classify the variant against
        :param ref_allele: reference allele; "-" is treated as the empty string
        :param alt_allele: alternate allele; "-" is treated as the empty string
        :param start: variant start position (must be convertible with ``int``)
        :param end: variant end position (must be convertible with ``int``)
        :param variant_type: variant type string
        :param dist: passed to the splice site overlap check (default 2)
        :return: a VariantClassification instance
        :raises ValueError: if none of the classification branches applies
        """
        # Transcripts that are not protein coding are only labelled lincRNA or RNA.
        gene_type = tx.get_gene_type()
        if gene_type != "protein_coding":
            if gene_type == VariantClassification.LINCRNA:
                return VariantClassification(VariantClassification.LINCRNA, variant_type, tx.get_transcript_id())
            else:
                return VariantClassification(VariantClassification.RNA, variant_type, tx.get_transcript_id())

        if ref_allele == "-":
            ref_allele = ""
        if alt_allele == "-":
            alt_allele = ""

        s = int(start)
        e = int(end)
        is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(s, e, tx, variant_type)

        is_splice_site_tuple = self._determine_if_splice_site_overlap(s, e, tx, variant_type, dist)
        is_splice_site = is_splice_site_tuple[0]

        is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(start, end, tx, variant_type)

        # No exon overlap, but not beyond the exons either: intron (possibly a splice site).
        if not is_exon_overlap and not is_beyond_exons:
            exon_i = TranscriptProviderUtils.determine_closest_exon(tx, s, e)
            if is_splice_site:
                # Intron Splice Site
                return VariantClassification(VariantClassification.SPLICE_SITE, variant_type, tx.get_transcript_id(), vc_secondary=VariantClassification.INTRON, exon_i=exon_i)
            else:
                return VariantClassification(VariantClassification.INTRON, variant_type, tx.get_transcript_id(), exon_i=exon_i)

        # No exon overlap and beyond the exons: flank or IGR.
        if not is_exon_overlap and is_beyond_exons:
            if is_flank:
                # Flanks
                if side.startswith("3"):
                    return VariantClassification(VariantClassification.THREE_PRIME_PRIME_FLANK, variant_type, transcript_id=tx.get_transcript_id())
                else:
                    return VariantClassification(VariantClassification.FIVE_PRIME_PRIME_FLANK, variant_type, transcript_id=tx.get_transcript_id())

            else:
                # IGR
                return VariantClassification(VariantClassification.IGR, variant_type)

        is_start_codon_overlap = self._determine_codon_overlap(s, e, tx.get_start_codon(), variant_type)
        is_stop_codon_overlap = self._determine_codon_overlap(s, e, tx.get_stop_codon(), variant_type)

        # Variant types not ending in "NP" that hit the start/stop codon get a dedicated classification.
        if is_start_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Start_Codon_' + variant_type.capitalize(), variant_type, transcript_id=tx.get_transcript_id())
        if is_stop_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Stop_Codon_' + variant_type.capitalize(), variant_type, transcript_id=tx.get_transcript_id())

        is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
        if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
            # UTR
            if side.startswith("3"):
                vc_tmp = VariantClassification.THREE_PRIME_UTR
            else:
                vc_tmp = VariantClassification.FIVE_PRIME_UTR
            transcript_position_exon_space_start, transcript_position_exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
            vc = self._determine_de_novo(vc_tmp, transcript_position_exon_space_start, ref_allele, alt_allele, tx, variant_type)
            return VariantClassification(vc, variant_type, transcript_id=tx.get_transcript_id(), )

        # We have a clean overlap in the CDS.  Includes start codon or stop codon.
        if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
            is_frameshift_indel = self.is_frameshift_indel(variant_type, s, e, alt_allele)
            return self._determine_vc_for_cds_overlap(start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon_overlap)

        # Fallback; uses get_transcript_id() (the previous misspelled accessor name would have
        # raised AttributeError instead of this ValueError).
        raise ValueError("Could not determine variant classification:  " + tx.get_transcript_id() + " " + str([ref_allele, alt_allele, start, end]))
Exemplo n.º 9
0
    def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
        """Build the variant classification for a variant overlapping the CDS.

        Note: This method can also handle start and stop codons.

        :param start: variant start in genomic space
        :param end: variant end in genomic space
        :param ref_allele: reference allele
        :param alt_allele: alternate allele
        :param is_frameshift_indel: forwarded to ``infer_variant_classification``
        :param is_splice_site: forwarded to ``infer_variant_classification``
        :param tx: transcript the variant overlaps
        :param variant_type: variant type
        :param is_start_codon: forwarded to ``infer_variant_classification``
        :return: VariantClassification carrying the codons, amino acids, protein positions,
            stranded alleles and exon index computed here
        """
        logger = logging.getLogger(__name__)

        observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
        transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            start, end, tx)

        # Positive-strand variants other than insertions are shifted back by one position.
        if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
            transcript_position_start -= 1
            transcript_position_end -= 1

        transcript_seq = tx.get_seq()
        protein_seq = tx.get_protein_seq()
        cds_start, cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
            transcript_position_start,
            transcript_position_end, cds_start)

        # When the transcript sequence disagrees with the stranded reference allele (insertions
        # excepted), write the reference allele into a copy of the transcript sequence.
        new_ref_transcript_seq = transcript_seq
        if (transcript_seq[transcript_position_start:transcript_position_end+1] != reference_allele_stranded) and variant_type != VariantClassification.VT_INS:
            new_ref_transcript_seq = list(transcript_seq)
            new_ref_transcript_seq[transcript_position_start:transcript_position_end+1] = reference_allele_stranded
            new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
            ref_tx_seq_has_been_changed = True
        else:
            ref_tx_seq_has_been_changed = False
        cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(protein_position_start, protein_position_end, cds_start)

        if variant_type == "DEL":
            reference_codon_seq = new_ref_transcript_seq[cds_codon_start:cds_codon_end+1].lower()
        else:
            reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(new_ref_transcript_seq[cds_codon_start:cds_codon_end+1].lower(), cds_codon_start, transcript_position_start, transcript_position_end, reference_allele_stranded, variant_type)

        # Negative-strand insertions are applied with the codon start moved back by one.
        if variant_type == "INS" and tx.get_strand() == "-":
            mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(reference_codon_seq.lower(), cds_codon_start - 1, transcript_position_start, transcript_position_end, observed_allele_stranded, variant_type)
        else:
            mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(reference_codon_seq.lower(), cds_codon_start, transcript_position_start, transcript_position_end, observed_allele_stranded, variant_type)

        observed_aa = Bio.Seq.translate(mutated_codon_seq)
        if ref_tx_seq_has_been_changed:
            # The protein sequence no longer matches the patched reference; translate the codon instead.
            reference_aa = Bio.Seq.translate(reference_codon_seq)
        else:
            reference_aa = protein_seq[protein_position_start-1:protein_position_end]

        if variant_type != VariantClassification.VT_SNP:

            try:
                reference_aa, observed_aa, protein_position_start, protein_position_end = \
                    self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                        protein_position_end, reference_aa, observed_aa)
            except InvalidVariantException as ive:
                # Best effort: log the problem and carry on with the unadjusted values.
                logger.error("Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s", tx.get_contig(), start, end, ref_allele, alt_allele, variant_type)
                logger.error(str(ive))
                # Logger.warning replaces the deprecated Logger.warn alias.
                logger.warning("Above error may not have exact start and end positions if this is a VCF input.")
                logger.warning("Variant type is likely incorrect.  This can happen with some GATK VCFs")
                logger.warning(TranscriptProviderUtils.is_valid_xNP(variant_type, ref_allele, alt_allele))
                logger.warning("The protein_change annotation may not be properly rendered.")

        vc_tmp, vc_tmp_secondary = self.infer_variant_classification(variant_type, reference_aa, observed_aa, ref_allele, alt_allele,
                                                   is_frameshift_indel=is_frameshift_indel, is_splice_site=is_splice_site, is_start_codon=is_start_codon)

        cds_start_exon_space, cds_end_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
        exon_i = TranscriptProviderUtils.determine_exon_index(int(start), int(end), tx, variant_type)
        final_vc = VariantClassification(vc_tmp, variant_type, transcript_id=tx.get_transcript_id(), alt_codon=mutated_codon_seq, ref_codon=reference_codon_seq, ref_aa=reference_aa, ref_protein_start=protein_position_start, ref_protein_end=protein_position_end, alt_aa=observed_aa, alt_codon_start_in_exon=cds_codon_start, alt_codon_end_in_exon=cds_codon_end, ref_codon_start_in_exon=cds_codon_start, ref_codon_end_in_exon=cds_codon_end, cds_start_in_exon_space=cds_start_exon_space, ref_allele_stranded=reference_allele_stranded, alt_allele_stranded=observed_allele_stranded, exon_i=exon_i, vc_secondary=vc_tmp_secondary)
        return final_vc
Exemplo n.º 10
0
    def variant_classify(self,
                         tx,
                         ref_allele,
                         alt_allele,
                         start,
                         end,
                         variant_type,
                         dist=2):
        """Perform classifications.

        Everything handled in genomic space

        *RNA*
        x'UTR
        Splice_Site (Intron)
        Intron
        Splice_Site (Exon)
        {Missense, Silent}
        {Nonsense, Silent}
        {Nonstop, Silent}
        IGR
        x'Flank
        De_novo_Start

        :param tx: transcript to classify the variant against
        :param ref_allele: reference allele; "-" is treated as the empty string
        :param alt_allele: alternate allele; "-" is treated as the empty string
        :param start: variant start position (must be convertible with ``int``)
        :param end: variant end position (must be convertible with ``int``)
        :param variant_type: variant type string
        :param dist: passed to the splice site overlap check (default 2)
        :return: a VariantClassification instance
        :raises ValueError: if none of the classification branches applies
        """
        # Transcripts that are not protein coding are only labelled lincRNA or RNA.
        gene_type = tx.get_gene_type()
        if gene_type != "protein_coding":
            if gene_type == VariantClassification.LINCRNA:
                return VariantClassification(VariantClassification.LINCRNA,
                                             variant_type,
                                             tx.get_transcript_id())
            else:
                return VariantClassification(VariantClassification.RNA,
                                             variant_type,
                                             tx.get_transcript_id())

        if ref_allele == "-":
            ref_allele = ""
        if alt_allele == "-":
            alt_allele = ""

        s = int(start)
        e = int(end)
        is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(
            s, e, tx, variant_type)

        is_splice_site_tuple = self._determine_if_splice_site_overlap(
            s, e, tx, variant_type, dist)
        is_splice_site = is_splice_site_tuple[0]

        is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(
            start, end, tx, variant_type)

        # No exon overlap, but not beyond the exons either: intron (possibly a splice site).
        if not is_exon_overlap and not is_beyond_exons:
            exon_i = TranscriptProviderUtils.determine_closest_exon(tx, s, e)
            if is_splice_site:
                # Intron Splice Site
                return VariantClassification(
                    VariantClassification.SPLICE_SITE,
                    variant_type,
                    tx.get_transcript_id(),
                    vc_secondary=VariantClassification.INTRON,
                    exon_i=exon_i)
            else:
                return VariantClassification(VariantClassification.INTRON,
                                             variant_type,
                                             tx.get_transcript_id(),
                                             exon_i=exon_i)

        # No exon overlap and beyond the exons: flank or IGR.
        if not is_exon_overlap and is_beyond_exons:
            if is_flank:
                # Flanks
                if side.startswith("3"):
                    return VariantClassification(
                        VariantClassification.THREE_PRIME_PRIME_FLANK,
                        variant_type,
                        transcript_id=tx.get_transcript_id())
                else:
                    return VariantClassification(
                        VariantClassification.FIVE_PRIME_PRIME_FLANK,
                        variant_type,
                        transcript_id=tx.get_transcript_id())

            else:
                # IGR
                return VariantClassification(VariantClassification.IGR,
                                             variant_type)

        is_start_codon_overlap = self._determine_codon_overlap(
            s, e, tx.get_start_codon(), variant_type)
        is_stop_codon_overlap = self._determine_codon_overlap(
            s, e, tx.get_stop_codon(), variant_type)

        # Variant types not ending in "NP" that hit the start/stop codon get a dedicated classification.
        if is_start_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Start_Codon_' +
                                         variant_type.capitalize(),
                                         variant_type,
                                         transcript_id=tx.get_transcript_id())
        if is_stop_codon_overlap and not variant_type.endswith("NP"):
            return VariantClassification('Stop_Codon_' +
                                         variant_type.capitalize(),
                                         variant_type,
                                         transcript_id=tx.get_transcript_id())

        is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
        if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
            # UTR
            if side.startswith("3"):
                vc_tmp = VariantClassification.THREE_PRIME_UTR
            else:
                vc_tmp = VariantClassification.FIVE_PRIME_UTR
            transcript_position_exon_space_start, transcript_position_exon_space_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                start, end, tx)
            vc = self._determine_de_novo(vc_tmp,
                                         transcript_position_exon_space_start,
                                         ref_allele, alt_allele, tx,
                                         variant_type)
            return VariantClassification(
                vc,
                variant_type,
                transcript_id=tx.get_transcript_id(),
            )

        # We have a clean overlap in the CDS.  Includes start codon or stop codon.
        if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
            is_frameshift_indel = self.is_frameshift_indel(
                variant_type, s, e, alt_allele)
            return self._determine_vc_for_cds_overlap(
                start, end, ref_allele, alt_allele, is_frameshift_indel,
                is_splice_site, tx, variant_type, is_start_codon_overlap)

        # Fallback; uses get_transcript_id() (the previous misspelled accessor name would have
        # raised AttributeError instead of this ValueError).
        raise ValueError("Could not determine variant classification:  " +
                         tx.get_transcript_id() + " " +
                         str([ref_allele, alt_allele, start, end]))
Exemplo n.º 11
0
    def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele,
                                      is_frameshift_indel, is_splice_site, tx,
                                      variant_type, is_start_codon):
        """Build the variant classification for a variant overlapping the CDS.

        Note: This method can also handle start and stop codons.

        :param start: variant start in genomic space
        :param end: variant end in genomic space
        :param ref_allele: reference allele
        :param alt_allele: alternate allele
        :param is_frameshift_indel: forwarded to ``infer_variant_classification``
        :param is_splice_site: forwarded to ``infer_variant_classification``
        :param tx: transcript the variant overlaps
        :param variant_type: variant type
        :param is_start_codon: forwarded to ``infer_variant_classification``
        :return: VariantClassification carrying the codons, amino acids, protein positions,
            stranded alleles and exon index computed here
        """
        logger = logging.getLogger(__name__)

        observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(
            ref_allele, alt_allele, tx)
        transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
            start, end, tx)

        # Positive-strand variants other than insertions are shifted back by one position.
        if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
            transcript_position_start -= 1
            transcript_position_end -= 1

        transcript_seq = tx.get_seq()
        protein_seq = tx.get_protein_seq()
        cds_start, cds_stop = TranscriptProviderUtils.determine_cds_in_exon_space(
            tx)
        protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
            transcript_position_start, transcript_position_end, cds_start)

        # When the transcript sequence disagrees with the stranded reference allele (insertions
        # excepted), write the reference allele into a copy of the transcript sequence.
        new_ref_transcript_seq = transcript_seq
        if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded
            ) and variant_type != VariantClassification.VT_INS:
            new_ref_transcript_seq = list(transcript_seq)
            new_ref_transcript_seq[
                transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
            new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
            ref_tx_seq_has_been_changed = True
        else:
            ref_tx_seq_has_been_changed = False
        cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
            protein_position_start, protein_position_end, cds_start)

        if variant_type == "DEL":
            reference_codon_seq = new_ref_transcript_seq[
                cds_codon_start:cds_codon_end + 1].lower()
        else:
            reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
                new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(),
                cds_codon_start,
                transcript_position_start, transcript_position_end,
                reference_allele_stranded, variant_type)

        # Negative-strand insertions are applied with the codon start moved back by one.
        if variant_type == "INS" and tx.get_strand() == "-":
            mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
                reference_codon_seq.lower(), cds_codon_start - 1,
                transcript_position_start, transcript_position_end,
                observed_allele_stranded, variant_type)
        else:
            mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
                reference_codon_seq.lower(), cds_codon_start,
                transcript_position_start, transcript_position_end,
                observed_allele_stranded, variant_type)

        observed_aa = MutUtils.translate_sequence(mutated_codon_seq)
        if ref_tx_seq_has_been_changed:
            # The protein sequence no longer matches the patched reference; translate the codon instead.
            reference_aa = MutUtils.translate_sequence(reference_codon_seq)
        else:
            reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

        if variant_type != VariantClassification.VT_SNP:

            try:
                reference_aa, observed_aa, protein_position_start, protein_position_end = \
                    self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                        protein_position_end, reference_aa, observed_aa)
            except InvalidVariantException as ive:
                # Best effort: log the problem and carry on with the unadjusted values.
                logger.error(
                    "Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                    tx.get_contig(), start, end, ref_allele, alt_allele,
                    variant_type)
                logger.error(str(ive))
                # Logger.warning replaces the deprecated Logger.warn alias.
                logger.warning(
                    "Above error may not have exact start and end positions if this is a VCF input."
                )
                logger.warning(
                    "Variant type is likely incorrect.  This can happen with some GATK VCFs"
                )
                logger.warning(
                    TranscriptProviderUtils.is_valid_xNP(
                        variant_type, ref_allele, alt_allele))
                logger.warning(
                    "The protein_change annotation may not be properly rendered."
                )

        vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
            variant_type,
            reference_aa,
            observed_aa,
            ref_allele,
            alt_allele,
            is_frameshift_indel=is_frameshift_indel,
            is_splice_site=is_splice_site,
            is_start_codon=is_start_codon)

        cds_start_exon_space, cds_end_exon_space = TranscriptProviderUtils.determine_cds_in_exon_space(
            tx)
        exon_i = TranscriptProviderUtils.determine_exon_index(
            int(start), int(end), tx, variant_type)
        final_vc = VariantClassification(
            vc_tmp,
            variant_type,
            transcript_id=tx.get_transcript_id(),
            alt_codon=mutated_codon_seq,
            ref_codon=reference_codon_seq,
            ref_aa=reference_aa,
            ref_protein_start=protein_position_start,
            ref_protein_end=protein_position_end,
            alt_aa=observed_aa,
            alt_codon_start_in_exon=cds_codon_start,
            alt_codon_end_in_exon=cds_codon_end,
            ref_codon_start_in_exon=cds_codon_start,
            ref_codon_end_in_exon=cds_codon_end,
            cds_start_in_exon_space=cds_start_exon_space,
            ref_allele_stranded=reference_allele_stranded,
            alt_allele_stranded=observed_allele_stranded,
            exon_i=exon_i,
            vc_secondary=vc_tmp_secondary)
        return final_vc