def __init__(self, hdp, tx_ac, alt_ac, alt_aln_method):
    """Load the alignment data for one transcript/reference pair.

    :param hdp: data provider; ``get_tx_identity_info`` is used for the
        ``"transcript"`` method, ``get_tx_info`` and ``get_tx_exons``
        for every other alignment method
    :param tx_ac: transcript accession
    :param alt_ac: accession of the sequence the transcript is aligned to
    :param alt_aln_method: alignment method; ``"transcript"`` selects the
        identity case (n <-> c)
    :raises HGVSDataNotAvailableError: if the provider returns ``None`` for
        a required record, or if exons that are consecutive by ``ord`` are
        not adjacent in transcript coordinates
    """
    self.tx_ac = tx_ac
    self.alt_ac = alt_ac
    self.alt_aln_method = alt_aln_method

    if self.alt_aln_method == "transcript":
        # this covers the identity cases n <-> c
        identity_info = hdp.get_tx_identity_info(self.tx_ac)
        if identity_info is None:
            raise HGVSDataNotAvailableError(
                "AlignmentMapper(tx_ac={self.tx_ac}, "
                "alt_ac={self.alt_ac}, alt_aln_method={self.alt_aln_method}): "
                "No transcript identity info".format(self=self))
        self.cds_start_i = identity_info["cds_start_i"]
        self.cds_end_i = identity_info["cds_end_i"]
        self.tgt_len = sum(identity_info["lengths"])
    else:
        tx_info = hdp.get_tx_info(self.tx_ac, self.alt_ac, self.alt_aln_method)
        if tx_info is None:
            raise HGVSDataNotAvailableError(
                "AlignmentMapper(tx_ac={self.tx_ac}, "
                "alt_ac={self.alt_ac}, alt_aln_method={self.alt_aln_method}): "
                "No transcript info".format(self=self))

        tx_exons = hdp.get_tx_exons(self.tx_ac, self.alt_ac, self.alt_aln_method)
        if tx_exons is None:
            raise HGVSDataNotAvailableError(
                "AlignmentMapper(tx_ac={self.tx_ac}, "
                "alt_ac={self.alt_ac}, alt_aln_method={self.alt_aln_method}): "
                "No transcript exons".format(self=self))

        # hgvs-386: building the cigar string assumes that exons are
        # adjacent, so verify every consecutive pair (ordered by "ord").
        ordered_exons = sorted(tx_exons, key=lambda exon: exon["ord"])
        exon_pairs = zip(ordered_exons, ordered_exons[1:])
        for ordinal, (upstream, downstream) in enumerate(exon_pairs, start=1):
            if upstream["tx_end_i"] != downstream["tx_start_i"]:
                raise HGVSDataNotAvailableError(
                    "AlignmentMapper(tx_ac={self.tx_ac}, "
                    "alt_ac={self.alt_ac}, alt_aln_method={self.alt_aln_method}): "
                    "Exons {a} and {b} are not adjacent".format(
                        self=self, a=ordinal, b=ordinal + 1))

        # Strand and offset come from the first exon as returned by the
        # provider (not from the sorted copy used for the check above).
        first_exon = tx_exons[0]
        self.strand = first_exon["alt_strand"]
        self.gc_offset = first_exon["alt_start_i"]
        self.cds_start_i = tx_info["cds_start_i"]
        self.cds_end_i = tx_info["cds_end_i"]
        self.cigar = build_tx_cigar(tx_exons, self.strand)
        self.ref_pos, self.tgt_pos, self.cigar_op = self._parse_cigar(self.cigar)
        self.tgt_len = self.tgt_pos[-1]

    start_undefined = self.cds_start_i is None
    end_undefined = self.cds_end_i is None
    assert start_undefined == end_undefined, "CDS start and end must both be defined or neither defined"
def setup_transcript_data(cls, hdp, tx_ac, pro_ac):
    """Build the RefTranscriptData used by c_to_p for one transcript.

    :param hdp: data provider offering ``get_tx_identity_info``, ``get_seq``,
        ``get_pro_ac_for_tx_ac`` and ``get_acs_for_protein_seq``
    :param tx_ac: transcript accession
    :param pro_ac: protein accession, or ``None`` to look one up via ``hdp``
    :returns: ``RefTranscriptData`` built from the transcript sequence, its
        translated CDS, the 1-based CDS start and stop, and the protein
        accession
    :raises HGVSDataNotAvailableError: if the transcript info or the
        transcript sequence is not available
    """
    # Look the transcript up by the accession passed in (the previous code
    # referenced an undefined ``var_c`` here and always raised NameError).
    tx_info = hdp.get_tx_identity_info(tx_ac)
    tx_seq = hdp.get_seq(tx_ac)
    if tx_info is None or tx_seq is None:
        raise HGVSDataNotAvailableError(
            "Missing transcript data for accession: {}".format(tx_ac))

    # use 1-based hgvs coords
    cds_start = tx_info["cds_start_i"] + 1
    cds_stop = tx_info["cds_end_i"]

    # Pad the CDS with N up to a whole number of codons so biopython won't
    # complain during the conversion.  ``list.extend`` returns None, so the
    # old ``"".join(list(...).extend(...))`` raised TypeError and never
    # padded anything.
    tx_seq_to_translate = tx_seq[cds_start - 1:cds_stop]
    missing_bases = -len(tx_seq_to_translate) % 3
    if missing_bases:
        tx_seq_to_translate += "N" * missing_bases

    tx_seq_cds = Seq(tx_seq_to_translate)
    protein_seq = str(tx_seq_cds.translate())

    if pro_ac is None:
        # get_acs... will always return at least the MD5_ accession
        pro_ac = (hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0])

    transcript_data = RefTranscriptData(tx_seq, protein_seq, cds_start, cds_stop, pro_ac)
    return transcript_data
def _alt_ac_for_tx_ac(self, tx_ac):
    """Return the single chromosomal accession that tx_ac is aligned to.

    Candidates are the mapping options reported by ``self.hdp`` whose
    alignment method equals ``self.alt_aln_method`` and whose accession is
    in ``self._assembly_accessions``.  When several candidates remain they
    must be named exactly X and Y in ``self._assembly_map``; in that case
    ``self.in_par_assume`` picks one of them.

    :param tx_ac: transcript accession
    :raises HGVSDataNotAvailableError: if there is no candidate at all
    :raises HGVSError: if several candidates remain and they are not the
        X/Y pair, if ``in_par_assume`` is ``None``, or if ``in_par_assume``
        does not select exactly one candidate
    """
    candidates = [
        option["alt_ac"]
        for option in self.hdp.get_tx_mapping_options(tx_ac)
        if option["alt_aln_method"] == self.alt_aln_method
        and option["alt_ac"] in self._assembly_accessions
    ]

    if not candidates:
        raise HGVSDataNotAvailableError(
            "No alignments for {tx_ac} in {an} using {am}".format(
                tx_ac=tx_ac, an=self.assembly_name, am=self.alt_aln_method))

    # TODO: conditional is unnecessary; remove
    if len(candidates) > 1:
        chrom_names = {self._assembly_map[ac] for ac in candidates}
        if chrom_names != {"X", "Y"}:
            described = [
                "{ac} ({n})".format(ac=ac, n=self._assembly_map[ac])
                for ac in candidates
            ]
            alts = ", ".join(described)
            raise HGVSError(
                "Multiple chromosomal alignments for {tx_ac} in {an}"
                " using {am} (non-pseudoautosomal region) [{alts}]".format(
                    tx_ac=tx_ac, an=self.assembly_name, am=self.alt_aln_method, alts=alts))

        # Only X and Y remain: assume a pseudoautosomal region.
        if self.in_par_assume is None:
            raise HGVSError(
                "Multiple chromosomal alignments for {tx_ac} in {an}"
                " using {am} (likely pseudoautosomal region)".format(
                    tx_ac=tx_ac, an=self.assembly_name, am=self.alt_aln_method))

        candidates = [
            ac for ac in candidates
            if self._assembly_map[ac] == self.in_par_assume
        ]
        if len(candidates) != 1:
            raise HGVSError(
                "Multiple chromosomal alignments for {tx_ac} in {an}"
                " using {am}; in_par_assume={ipa} selected {n} of them".format(
                    tx_ac=tx_ac,
                    an=self.assembly_name,
                    am=self.alt_aln_method,
                    ipa=self.in_par_assume,
                    n=len(candidates)))

    assert len(candidates) == 1, "Should have exactly one alignment at this point"
    return candidates[0]
def __init__(self, hdp, tx_ac, pro_ac):
    """Gather the reference transcript data that c_to_p works from.

    Sets ``transcript_sequence``, ``aa_sequence``, ``cds_start``,
    ``cds_stop`` and ``protein_accession`` on the instance.

    :param hdp: data provider offering ``get_tx_identity_info``, ``get_seq``,
        ``get_pro_ac_for_tx_ac`` and ``get_acs_for_protein_seq``
    :param tx_ac: transcript accession
    :param pro_ac: protein accession, or ``None`` to look one up via ``hdp``
    :raises HGVSDataNotAvailableError: if the transcript info or the
        transcript sequence is not available
    :raises NotImplementedError: if the CDS length is not a multiple of 3
    """
    tx_info = hdp.get_tx_identity_info(tx_ac)
    tx_seq = hdp.get_seq(tx_ac)
    data_missing = tx_info is None or tx_seq is None
    if data_missing:
        raise HGVSDataNotAvailableError(
            "Missing transcript data for accession: {}".format(tx_ac))

    # cds_start is 1-based (hgvs coords); the slice below goes back to 0-based
    cds_start = tx_info["cds_start_i"] + 1
    cds_stop = tx_info["cds_end_i"]
    coding_seq = tx_seq[cds_start - 1:cds_stop]

    # coding sequences that are not divisible by 3 are not yet supported
    coding_len = len(coding_seq)
    if coding_len % 3:
        raise NotImplementedError(
            "Transcript {} is not supported because its sequence length of {} is not divisible by 3."
            .format(tx_ac, coding_len))

    protein_seq = translate_cds(coding_seq)

    if pro_ac is None:
        # get_acs... will always return at least the MD5_ accession
        # TODO: drop get_acs_for_protein_seq; use known mapping or digest (wo/pro ac inference)
        pro_ac = hdp.get_pro_ac_for_tx_ac(tx_ac)
        if not pro_ac:
            pro_ac = hdp.get_acs_for_protein_seq(protein_seq)[0]

    self.transcript_sequence = tx_seq
    self.aa_sequence = protein_seq
    self.cds_start = cds_start
    self.cds_stop = cds_stop
    self.protein_accession = pro_ac
def _ensure_schema_exists(self):
    """Check that the schema named by ``self.url.schema`` exists.

    Returns ``None`` when the lookup in ``pg_namespace`` reports the schema.

    :raises HGVSDataNotAvailableError: if the schema is not found
    """
    # N.B. On AWS RDS, information_schema.schemata always returns zero rows,
    # which is why pg_namespace is queried instead.
    schema_name = self.url.schema
    row = self._fetchone(
        "select exists(SELECT 1 FROM pg_namespace WHERE nspname = %s)", [schema_name])
    if not row[0]:
        raise HGVSDataNotAvailableError(
            "specified schema ({}) does not exist (url={})".format(self.url.schema, self.url))
def _get_tgt_length(self, var):
    """Return the total length of the reference sequence for var.

    Variants of type ``g`` or ``m`` are treated as unbounded and yield
    ``float("inf")``.  For any other type the length is the sum of the
    ``lengths`` entry of the identity info that ``self.hdp`` reports for
    ``var.ac``.

    :raises HGVSDataNotAvailableError: if no identity info is available
    """
    if var.type in ("g", "m"):
        return float("inf")

    identity_info = self.hdp.get_tx_identity_info(var.ac)
    if not identity_info:
        raise HGVSDataNotAvailableError("No identity info available for {ac}".format(ac=var.ac))
    return sum(identity_info["lengths"])
def _get_boundary(self, var):
    """Get the positions of the boundaries that enclose the current variant.

    For ``r`` and ``n`` variants (unless ``self.cross_boundaries`` is set)
    the result is the 0-based interval ``(left, right)`` of the exon that
    contains the variant, narrowed further so that it does not cross the
    CDS start or CDS end.  For every other variant type, and whenever
    ``self.cross_boundaries`` is set, the result is ``(0, float("inf"))``.

    :param var: variant; ``var.type``, ``var.ac`` and
        ``var.posedit.pos.start.base`` / ``.end.base`` are read
    :returns: tuple ``(left, right)``
    :raises HGVSDataNotAvailableError: if the provider has no mapping
        options for ``var.ac``, or none for ``self.alt_aln_method``
    :raises HGVSUnsupportedOperationError: if the variant spans an
        exon-intron, UTR-exon or exon-UTR boundary
    """
    if var.type not in ("r", "n"):
        # For variant type of g and m etc.
        return 0, float("inf")
    if self.cross_boundaries:
        return 0, float("inf")

    # Find the reference accession this transcript is aligned to
    map_info = self.hdp.get_tx_mapping_options(var.ac)
    if not map_info:
        raise HGVSDataNotAvailableError(
            "No mapping info available for {ac}".format(ac=var.ac))
    map_info = [
        item for item in map_info
        if item["alt_aln_method"] == self.alt_aln_method
    ]
    if not map_info:
        # The transcript is only aligned with other methods; report that as
        # missing data rather than failing with IndexError on map_info[0].
        raise HGVSDataNotAvailableError(
            "No mapping info available for {ac} using {am}".format(
                ac=var.ac, am=self.alt_aln_method))
    alt_ac = map_info[0]["alt_ac"]

    # Get tx info
    tx_info = self.hdp.get_tx_info(var.ac, alt_ac, self.alt_aln_method)
    cds_start = tx_info["cds_start_i"]
    cds_end = tx_info["cds_end_i"]

    # Get exon info
    exon_info = self.hdp.get_tx_exons(var.ac, alt_ac, self.alt_aln_method)
    exon_starts = sorted(exon["tx_start_i"] for exon in exon_info)
    exon_ends = sorted(exon["tx_end_i"] for exon in exon_info)
    # Everything past the last exon end is one extra, open-ended region
    exon_starts.append(exon_ends[-1])
    exon_ends.append(float("inf"))

    # 0-based positions of the first and last base of the variant
    start_i = var.posedit.pos.start.base - 1
    end_i = var.posedit.pos.end.base - 1

    # TODO: #242: implement methods to find tx regions
    def region_index(pos_i):
        """Return the index of the first region whose [start, end) holds pos_i."""
        for idx, (region_start, region_end) in enumerate(zip(exon_starts, exon_ends)):
            if region_start <= pos_i < region_end:
                return idx
        # No region holds pos_i: use the last region, which is where the
        # original index loop ended up when it never hit its break.
        return len(exon_starts) - 1

    i = region_index(start_i)
    j = region_index(end_i)
    if i != j:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the exon-intron boundary ({var})"
            .format(var=var))

    left = exon_starts[i]
    right = exon_ends[i]

    # Clip the interval at the CDS start unless the transcript has none
    if cds_start is None:
        pass
    elif end_i < cds_start:
        right = min(right, cds_start)
    elif start_i >= cds_start:
        left = max(left, cds_start)
    else:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the UTR-exon boundary ({var})"
            .format(var=var))

    # Clip the interval at the CDS end unless the transcript has none
    if cds_end is None:
        pass
    elif start_i >= cds_end:
        left = max(left, cds_end)
    elif end_i < cds_end:
        right = min(right, cds_end)
    else:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the exon-UTR boundary ({var})"
            .format(var=var))

    return left, right