def c_to_p(self, var_c, pro_ac=None):
    """
    Translate a c. SequenceVariant into the corresponding p. SequenceVariant
    on the specified protein accession.

    The variant is checked to be of type "c", optionally run through the
    configured validator, and then projected onto the protein via
    AltSeqBuilder / AltSeqToHgvsp.

    Author: Rudy Rico

    :param SequenceVariant var_c: hgvsc tag
    :param str pro_ac: protein accession (passed through to RefTranscriptData)
    :rtype: hgvs.sequencevariant.SequenceVariant
    :raises HGVSInvalidVariantError: if ``var_c.type`` is not ``"c"``
    """
    is_cdna = var_c.type == "c"
    if not is_cdna:
        raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))

    if self._validator:
        self._validator.validate(var_c)

    ref_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
    alt_seqs = altseqbuilder.AltSeqBuilder(var_c, ref_data).build_altseq()

    # TODO: handle case where you get 2+ alt sequences back;
    # currently get list of 1 element. Every alt sequence is still
    # converted, but only the first resulting variant is returned.
    protein_variants = [
        altseq_to_hgvsp.AltSeqToHgvsp(ref_data, alt_seq).build_hgvsp()
        for alt_seq in alt_seqs
    ]
    result = protein_variants[0]

    if self.add_gene_symbol:
        self._update_gene_symbol(result, var_c.gene)

    return result
def c_to_p(self, var_c, pro_ac=None):
    """
    Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession

    Author: Rudy Rico

    :param SequenceVariant var_c: hgvsc tag
    :param str pro_ac: protein accession; when None it is looked up from
        the data provider for the transcript of ``var_c``
    :rtype: hgvs.sequencevariant.SequenceVariant
    :raises HGVSInvalidVariantError: if ``var_c.type`` is not ``"c"``
    :raises HGVSDataNotAvailableError: if the data provider returns no
        identity info or no sequence for the transcript
    """

    @attr.s(slots=True)
    class RefTranscriptData(object):
        transcript_sequence = attr.ib()
        aa_sequence = attr.ib()
        cds_start = attr.ib()
        cds_stop = attr.ib()
        protein_accession = attr.ib()

        @classmethod
        def setup_transcript_data(cls, hdp, tx_ac, pro_ac):
            """helper for generating RefTranscriptData for c_to_p"""
            # Use the tx_ac argument rather than reaching into the enclosing
            # function's var_c, so the helper depends only on its parameters.
            tx_info = hdp.get_tx_identity_info(tx_ac)
            tx_seq = hdp.get_seq(tx_ac)

            if tx_info is None or tx_seq is None:
                raise HGVSDataNotAvailableError("Missing transcript data for accession: {}".format(tx_ac))

            # use 1-based hgvs coords
            cds_start = tx_info["cds_start_i"] + 1
            cds_stop = tx_info["cds_end_i"]

            # Pad the CDS with "N" up to a whole number of codons so biopython
            # won't complain during the conversion. The padded string must be
            # assigned back: list.extend() returns None, so it cannot be
            # chained into "".join().
            tx_seq_to_translate = tx_seq[cds_start - 1:cds_stop]
            leftover = len(tx_seq_to_translate) % 3
            if leftover:
                tx_seq_to_translate += "N" * (3 - leftover)

            tx_seq_cds = Seq(tx_seq_to_translate)
            protein_seq = str(tx_seq_cds.translate())

            if pro_ac is None:
                # get_acs... will always return at least the MD5_ accession
                pro_ac = (hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0])

            return cls(tx_seq, protein_seq, cds_start, cds_stop, pro_ac)

    if not (var_c.type == "c"):
        raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
    if self._validator:
        self._validator.validate(var_c)

    reference_data = RefTranscriptData.setup_transcript_data(self.hdp, var_c.ac, pro_ac)
    builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

    # TODO: handle case where you get 2+ alt sequences back;
    # currently get list of 1 element. Every alt sequence is still
    # converted, but only the first resulting variant is returned.
    all_alt_data = builder.build_altseq()
    var_ps = [
        altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data).build_hgvsp()
        for alt_data in all_alt_data
    ]

    return var_ps[0]