def add_hgvs_allele(self, hgvs_allele):
    """Parse an HGVS expression and add its Location and Allele to the bundle.

    The variant's accession is added to ``self.identifiers`` under its VMC
    sequence identifier. The derived Location and Allele are stored in
    ``self.locations`` and ``self.alleles``, keyed by the ids produced by
    ``self._id_function``.

    :param hgvs_allele: HGVS variant string to be parsed.
    :returns: the ``models.Allele`` that was added.
    :raises ValueError: if the variant is intronic, or if its edit type is
        not one of ``ins``, ``sub``, ``del``, ``delins`` or ``identity``.
    """
    hp = _get_hgvs_parser()
    sv = hp.parse_hgvs_variant(hgvs_allele)

    sequence_id = get_vmc_sequence_identifier(sv.ac)
    # NOTE(review): relies on self.identifiers[sequence_id] already being a
    # set-like object (presumably a defaultdict(set)) -- confirm.
    self.identifiers[sequence_id].add(sv.ac)

    pos = sv.posedit.pos
    edit = sv.posedit.edit

    # Only offset-capable intervals can describe intronic positions.
    if isinstance(pos, hgvs.location.BaseOffsetInterval):
        if pos.start.is_intronic or pos.end.is_intronic:
            raise ValueError("Intronic HGVS variants are not supported")

    if edit.type == 'ins':
        # Insertions use a zero-width interval at the start base.
        interval = models.Interval(start=pos.start.base, end=pos.start.base)
        state = edit.alt
    elif edit.type in ('sub', 'del', 'delins', 'identity'):
        # The start coordinate is shifted down by one; the end is kept.
        interval = models.Interval(start=pos.start.base - 1, end=pos.end.base)
        if edit.type == 'identity':
            # An identity edit carries no alt; use the reference bases.
            state = get_reference_sequence(sv.ac, pos.start.base - 1, pos.end.base)
        else:
            # Deletions have no alt; represent them as the empty string.
            state = edit.alt or ''
    else:
        raise ValueError("HGVS variant type {} is unsupported".format(edit.type))

    location = models.Location(sequence_id=sequence_id, interval=interval)
    location.id = self._id_function(location)
    self.locations[location.id] = location

    allele = models.Allele(location_id=location.id, state=state)
    allele.id = self._id_function(allele)
    self.alleles[allele.id] = allele

    return allele
def from_hgvs(hgvs_string):
    """Parse an HGVS expression into a single-allele VMC bundle.

    Builds one Location and one Allele from the parsed variant, wraps them
    (plus an NCBI identifier for the accession) in a ``models.Vmcbundle``,
    and returns the result of passing that bundle to ``ppj``.

    :param hgvs_string: HGVS variant string to be parsed.
    :returns: whatever ``ppj`` returns for the bundle.
    :raises ValueError: if the variant is intronic, or if its edit type is
        not one of ``ins``, ``sub``, ``del`` or ``delins``.
    """
    hp = _get_hgvs_parser()
    sv = hp.parse_hgvs_variant(hgvs_string)

    ir = models.Identifier(namespace="NCBI", accession=sv.ac)
    # NOTE(review): the sequence id is hard-coded, so every accession maps to
    # the same sequence; the original carried a commented-out
    # get_vmc_sequence_id(ir) call here. Confirm the intended lookup before
    # relying on this function's output.
    sequence_id = "VMC:GS_Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"

    pos = sv.posedit.pos
    edit = sv.posedit.edit

    # Only offset-capable intervals can describe intronic positions.
    if isinstance(pos, hgvs.location.BaseOffsetInterval):
        if pos.start.is_intronic or pos.end.is_intronic:
            raise ValueError("Intronic HGVS variants are not supported")

    if edit.type == 'ins':
        # Insertions use a zero-width interval at the start base.
        interval = models.Interval(start=pos.start.base, end=pos.start.base)
    elif edit.type in ('sub', 'del', 'delins'):
        # The start coordinate is shifted down by one; the end is kept.
        interval = models.Interval(start=pos.start.base - 1, end=pos.end.base)
    else:
        raise ValueError("HGVS variant type {} is unsupported".format(edit.type))

    location = models.Location(sequence_id=sequence_id, interval=interval)
    location.id = computed_id(location)

    # Deletions have no alt; represent them as the empty string.
    state = edit.alt or ''
    allele = models.Allele(location_id=location.id, state=state)
    allele.id = computed_id(allele)

    bundle = models.Vmcbundle(
        alleles={allele.id: allele.as_dict()},
        genotypes={},
        haplotypes={},
        identifiers={sequence_id: [ir.as_dict()]},
        locations={location.id: location.as_dict()},
        meta={"version": "0.1"},
    )

    return ppj(bundle)
def add_hgvs_haplotype(self, hgvs_alleles, completeness="UNKNOWN"):
    """Add a haplotype built from several HGVS alleles to the bundle.

    Each HGVS string is added via ``self.add_hgvs_allele``. The haplotype's
    Location is the bounding interval (smallest start, largest end) around
    the alleles' intervals. The new Location and Haplotype are stored in
    ``self.locations`` and ``self.haplotypes``, keyed by the ids produced by
    ``self._id_function``.

    :param hgvs_alleles: iterable of HGVS variant strings; must yield at
        least one item.
    :param completeness: value passed through to ``models.Haplotype``.
    :returns: the ``models.Haplotype`` that was added.
    :raises ValueError: if ``hgvs_alleles`` is empty.
    :raises Exception: if the alleles are not all on the same sequence.
    """
    # Materialize first: hgvs_alleles may be a one-shot iterable.
    alleles = [self.add_hgvs_allele(hgvs_allele) for hgvs_allele in hgvs_alleles]
    if not alleles:
        # Without this guard the code below would leak a bare StopIteration
        # from next(iter(set())), which is confusing and hazardous inside
        # generators.
        raise ValueError("Haplotypes must contain at least one allele")

    allele_locations = [self.locations[a.location_id] for a in alleles]

    # create location from bounding box around alleles
    sequence_ids = {loc.sequence_id for loc in allele_locations}
    if len(sequence_ids) > 1:
        raise Exception("Haplotypes must be defined on a single sequence")
    sequence_id = next(iter(sequence_ids))

    intervals = [loc.interval for loc in allele_locations]
    interval_min = min(int(i.start) for i in intervals)
    interval_max = max(int(i.end) for i in intervals)
    interval = models.Interval(start=interval_min, end=interval_max)

    location = models.Location(sequence_id=sequence_id, interval=interval)
    location.id = self._id_function(location)
    self.locations[location.id] = location

    haplotype = models.Haplotype(
        completeness=completeness,
        location_id=location.id,
        allele_ids=[a.id for a in alleles],
    )
    haplotype.id = self._id_function(haplotype)
    self.haplotypes[haplotype.id] = haplotype

    return haplotype
def _make_vmc_allele(a):
    """Build the VMC Location and Allele for one CAR allele record.

    Given a dict (from CAR json) for a single genomicAllele or
    transcriptAllele, map its reference sequence to a VMC sequence id and
    construct a Location and Allele from the first coordinate set.

    Nothing is stored by this function; the caller receives the pieces.

    NOTE(review): ``self`` is not a parameter, so this is presumably nested
    inside a method and reads ``self._refseqmapper`` from the enclosing
    scope -- confirm.

    :param a: dict with a ``"referenceSequence"`` string and a non-empty
        ``"coordinates"`` list whose items have ``"start"``, ``"end"`` and
        ``"allele"`` keys.
    :returns: tuple ``(ir, sequence_id, location, allele)``.
    """
    # The reference sequence id is the last path component of the value.
    car_rsid = a["referenceSequence"].split("/")[-1]
    ir = self._refseqmapper[car_rsid]
    sequence_id = get_vmc_sequence_identifier(ir)

    # N.B. Double check CA coordinate semantics
    # If HGVS like re: insertions, then end -= 1 below
    if len(a["coordinates"]) > 1:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        _logger.warning("More than one coordinate set for resp[@id]; using only first")
    coords = a["coordinates"][0]

    # The start coordinate is shifted down by one; the end is kept.
    interval = models.Interval(start=coords["start"] - 1, end=coords["end"])
    location = models.Location(sequence_id=sequence_id, interval=interval)
    location.id = computed_id(location)

    allele = models.Allele(location_id=location.id, state=coords["allele"])
    allele.id = computed_id(allele)

    return (ir, sequence_id, location, allele)
import datetime
import json

from vmc import models, computed_id, serialize

# Interval: check the serialized form and the dict form.
i = models.Interval(start=42, end=42)
assert serialize(i) == "<Interval|42|42>"
assert i.as_dict() == {"end": 42, "start": 42}

# Location: the serialized form embeds the interval's serialization, and the
# computed id is pinned to a known value.
l = models.Location(sequence_id="VMC:GS_01234", interval=i)
assert serialize(l) == "<Location|VMC:GS_01234|<Interval|42|42>>"
l.id = computed_id(l)
assert l.id == "VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN"
assert l.as_dict() == {
    "id": "VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN",
    "interval": {
        "end": 42,
        "start": 42
    },
    "sequence_id": "VMC:GS_01234"
}

locations = {l.id: l.as_dict()}

# Allele: the serialized form refers to the location by id, and the computed
# id is pinned to a known value.
a = models.Allele(location_id=l.id, state="A")
assert serialize(a) == "<Allele|VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN|A>"
a.id = computed_id(a)
assert a.id == "VMC:GA_xTR0mmMviMLoAI9SwmDMFYr_AZczkjyU"
def build_loc(seq_id, start):
    """Return the computed id of a one-unit-wide Location on ``seq_id``.

    The Location spans the interval ``[start, start + 1]`` as passed to
    ``models.Interval``; its id is produced by ``computed_id``.

    :param seq_id: sequence identifier stored on the Location.
    :param start: interval start; the end is ``start + 1``.
    :returns: the id assigned to the new Location.
    """
    loc = models.Location(
        sequence_id=seq_id,
        interval=models.Interval(start=start, end=start + 1),
    )
    loc.id = computed_id(loc)
    return loc.id