def _check_superfluous(self, path):
    """
    Verify a redundantly supplied sequence or length against the reference.

    Examples of incorrect descriptions:
        NM_000143.3:c.45delT
        NG_012337.1:g.274ATT>T
        NM_000143.3:c.45dupT
        NM_000143.3:c.45dup4

    A mismatch is reported through ``self._add_error``; nothing is returned.

    :param path: Model path towards "deleted" or "inserted".
    """
    variant = self.internal_indexing_model["variants"][path[1]]
    key = path[-1]
    sequences = self.get_sequences()
    parts = variant[key]

    # A single part that carries an explicit length value is checked by
    # length only; everything else is checked by sequence content.
    has_length = (
        len(parts) == 1
        and parts[0].get("length")
        and parts[0]["length"].get("value")
    )
    if has_length:
        given_length = parts[0]["length"].get("value")
        location_length = get_location_length(variant["location"])
        if location_length != given_length:
            self._add_error(
                errors.length_mismatch(location_length, given_length, path)
            )
        return

    reference_seq = slice_sequence(variant["location"], sequences["reference"])
    given_seq = construct_sequence(parts, sequences)
    if not (given_seq and reference_seq) or given_seq == reference_seq:
        return

    if self.is_inverted():
        # Report sequences and path as seen on the inverted strand.
        given_seq = reverse_complement(given_seq)
        reference_seq = reverse_complement(reference_seq)
        path = reverse_path(self.internal_coordinates_model, path)
    self._add_error(errors.sequence_mismatch(reference_seq, given_seq, path))
def reverse_strand_shift(variants, seq):
    """
    Convert the inserted/deleted parts of the variants to the other strand.

    The variants are modified in place: the "inserted" and "deleted" lists
    are reversed and every literal sequence in them is reverse complemented.
    A single literal insertion whose start location carries a "shift" is
    recomputed from the reference window preceding the start position.

    :param variants: Variant models to update in place.
    :param seq: Sequence used to rebuild shifted insertions.
    """

    def _complement_literals(parts):
        # Reverse complement every part that carries a literal sequence.
        for part in parts:
            if part.get("sequence"):
                part["sequence"] = reverse_complement(part["sequence"])

    for variant in variants:
        insertions = variant.get("inserted")
        if insertions:
            insertions.reverse()
            is_shifted = (
                len(insertions) == 1
                and insertions[0].get("sequence")
                and variant["location"]["start"].get("shift")
            )
            if is_shifted:
                # TODO: Check what to do when there is a compound insertion
                #  with locations included.
                start = get_start(variant)
                shift = variant["location"]["start"]["shift"]
                ins_seq = insertions[0]["sequence"]
                window = seq[start - shift : start] + ins_seq
                insertions[0]["sequence"] = reverse_complement(
                    window[: len(ins_seq)]
                )
            else:
                _complement_literals(insertions)

        deletions = variant.get("deleted")
        if deletions:
            deletions.reverse()
            _complement_literals(deletions)
def map_description(
    description,
    reference_id,
    selector_id=None,
    slice_to=None,
    clean=False,
):
    """
    Map a description onto the reference identified by ``reference_id``.

    The description is normalized to obtain its observed sequence, the
    target reference is retrieved, and the variants between the target
    reference and the observed sequence are extracted and turned into a
    description.

    :param description: Input passed to ``Description``.
    :param reference_id: Id of the reference to map onto.
    :param selector_id: Optional selector id; when its selector model is
        inverted, the observed and original reference sequences are reverse
        complemented first.
    :param slice_to: Optional; when set, the retrieved reference model is
        replaced by the output of ``_get_reference_model``.
    :param clean: When true, variants that also occur between the two
        reference sequences are filtered out of the result.
    :return: The output of ``_get_description``, or a dict with an
        ``"errors"`` list when any step fails.
    """
    # Get the observed sequence
    d = Description(description)
    d.normalize()
    if d.errors:
        return {"errors": d.errors}
    # `or`, not `and`: references that exist but lack the "observed" entry
    # must be rejected here too, otherwise the lookup below raises KeyError.
    if not d.references or not d.references.get("observed"):
        return {"errors": [{"details": "No observed sequence or other error occured."}]}
    obs_seq = d.references["observed"]["sequence"]["seq"]

    r_model = retrieve_reference(reference_id)
    if r_model is None:
        return {"errors": [reference_not_retrieved(reference_id, [])]}

    ref_seq2 = d.references["reference"]["sequence"]["seq"]

    if selector_id:
        s_model = get_selector_model(r_model["annotations"], selector_id, True)
        if s_model is None:
            return {"errors": [no_selector_found(reference_id, selector_id, [])]}
        if s_model["inverted"]:
            obs_seq = reverse_complement(obs_seq)
            ref_seq2 = reverse_complement(ref_seq2)
    if slice_to:
        r_model = _get_reference_model(r_model, selector_id, slice_to)

    ref_seq1 = r_model["sequence"]["seq"]

    # Get the description extractor hgvs internal indexing variants
    variants = _extract_hgvs_internal_model(obs_seq, r_model)

    if clean:
        raw_de_variants = extractor.describe_dna(ref_seq1, ref_seq2)
        seq_variants = de_to_hgvs(
            raw_de_variants,
            {"reference": ref_seq1, "observed": ref_seq2},
        )
        # Filtering is only valid when every reference-to-reference variant
        # is also present among the mapped variants.
        if any(v not in variants for v in seq_variants):
            return {
                "errors": [{"code": "EMAPFILTER", "details": "Unsuccessful filtering."}]
            }
        variants = [v for v in variants if v not in seq_variants]

    return _get_description(variants, r_model, selector_id)
def _check_repeat(self, path):
    """
    Check that a repeat variant agrees with the reference sequence.

    Only a single inserted part with a literal sequence whose source is
    "description" is handled; anything else is reported as not supported.
    For a handled repeat, the reference slice at the variant location must
    consist of a whole number of copies of the repeat unit. Problems are
    reported through ``self._add_error``; nothing is returned.

    :param path: Model path of the variant to check.
    """
    if self.is_inverted():
        input_path = reverse_path(self.internal_indexing_model, path)
    else:
        input_path = path
    variant = self.input_model["variants"][input_path[1]]
    variant_i = self.internal_indexing_model["variants"][path[1]]

    insertions = variant_i.get("inserted")
    if not insertions or len(insertions) != 1:
        # TODO: Convert to delins and switch to warning?
        self._add_error(errors.repeat_not_supported(variant, path))
        return

    unit = insertions[0]
    # TODO: get the sequence from a reference slice
    if not unit.get("sequence") or unit.get("source") != "description":
        self._add_error(errors.repeat_not_supported(variant, path))
        return
    repeat_seq = unit["sequence"]

    reference = self.references["reference"]["sequence"]["seq"]
    ref_seq = reference[get_start(variant_i):get_end(variant_i)]
    if self.is_inverted():
        ref_seq = reverse_complement(ref_seq)

    copies, remainder = divmod(len(ref_seq), len(repeat_seq))
    if remainder != 0:
        self._add_error(errors.repeat_reference_sequence_length(path))
    elif copies * repeat_seq != ref_seq:
        self._add_error(
            errors.repeat_sequences_mismatch(ref_seq, repeat_seq, path)
        )
def slice_to_selector(model, selector_id, strand=False):
    """
    Build the sequence covered by the exons of a selector.

    :param model: Reference model providing ``["annotations"]`` and
        ``["sequence"]["seq"]``.
    :param selector_id: Id used to look up the selector model.
    :param strand: When true and the selector model is inverted, the
        concatenated sequence is reverse complemented.
    :return: The concatenation of the reference slices given by the selector
        model's "exon" entries, in the order they are listed.
    """
    s_m = get_selector_model(model["annotations"], selector_id)
    ref_seq = model["sequence"]["seq"]
    output = ""
    # Each exon entry is indexed as (start, end) into the reference sequence.
    for exon in s_m["exon"]:
        output += ref_seq[exon[0]:exon[1]]
    if strand and s_m["inverted"]:
        output = reverse_complement(output)
    return output
def merge_inserted_to_string(inserted, sequences):
    """
    Merge a list of inserted parts into one literal insertion.

    Each part contributes either its literal "sequence" or, when it has
    none, the output of ``get_inserted_sequence``. A part flagged as
    "inverted" contributes the reverse complement of its own piece only;
    pieces merged before it are left untouched.

    :param inserted: Inserted parts to merge.
    :param sequences: Sequences passed to ``get_inserted_sequence``.
    :return: ``{"source": "description", "sequence": <merged sequence>}``.
    """
    merged = ""
    for insertion in inserted:
        piece = insertion.get("sequence") or get_inserted_sequence(
            insertion, sequences
        )
        if insertion.get("inverted"):
            # Invert this piece alone, not everything accumulated so far.
            piece = reverse_complement(piece)
        merged += piece
    return {"source": "description", "sequence": merged}
def _get_ref_seq(r_model, selector_id=None): """ :param r_model: :param selector_id: :param slice_to: :return: """ ref_seq = r_model["sequence"]["seq"] if selector_id: s_model = get_selector_model(r_model, selector_id, True) if s_model["inverted"]: ref_seq = reverse_complement(ref_seq) return ref_seq
def construct_sequence(slices, sequences):
    """
    Assemble one sequence from a list of slice models.

    A slice contributes its literal "sequence" or, failing that, the part of
    ``sequences[slice["source"]]`` selected by its "location". The piece is
    then repeated when the slice has a "point" repeat number, and reverse
    complemented when the slice is flagged as "inverted".

    :param slices: Slice models to concatenate, in order.
    :param sequences: Sequences indexed by source.
    :return: The concatenated sequence.
    :raises Exception: For a slice with neither a sequence nor a location.
    """
    assembled = ""
    for part in slices:
        literal = part.get("sequence")
        if literal:
            piece = literal
        elif part.get("location"):
            piece = slice_sequence(part["location"], sequences[part["source"]])
        else:
            raise Exception("Unrecognized slice", part)

        repeat = part.get("repeat_number")
        if repeat and repeat.get("type") == "point":
            piece = piece * repeat["value"]
        if part.get("inverted"):
            piece = reverse_complement(piece)

        assembled += piece
    return assembled
def _fix_ensembl(r_m, r_id):
    """
    Restrict an Ensembl reference model to one feature, modifying it in place.

    :param r_m: Reference model with "annotations" and "sequence" entries.
    :param r_id: Feature id; anything from the first "." onwards is ignored.
    :return: The same ``r_m`` object, after modification.
    """
    # Drop the version suffix so the lookup uses the bare id.
    if "." in r_id:
        r_id = r_id.split(".")[0]
    f_m = extract_feature_model(r_m["annotations"], r_id, ancestors=False)[0]
    # A feature on the -1 strand: store the reverse complement of the sequence.
    if f_m["location"]["strand"] == -1:
        r_m["sequence"]["seq"] = reverse_complement(r_m["sequence"]["seq"])
    # Versioned id ("<id>.<version>") that becomes the annotations id below.
    f_id = f_m["id"] + "." + f_m["qualifiers"]["version"]
    if f_m["type"] == "mRNA":
        # Wrap the mRNA in the submodel found two path elements above it
        # (presumably its gene - TODO confirm), keeping it as the only feature.
        f_p = get_feature_path(r_m["annotations"], r_id)
        gene_model = get_submodel_by_path(r_m["annotations"], f_p[:-2])
        gene_model["features"] = [f_m]
        f_m = gene_model
    _update_ensembl_ids(f_m)
    # The selected feature becomes the only top-level feature.
    r_m["annotations"]["features"] = [f_m]
    r_m["annotations"]["id"] = f_id
    if r_m["annotations"].get("qualifiers") is None:
        r_m["annotations"]["qualifiers"] = {}
    r_m["annotations"]["qualifiers"]["mol_type"] = "genomic DNA"
    # The annotations' own start position is handed to the location update
    # (presumably used as an offset - TODO confirm).
    _update_ensembl_locations(
        r_m["annotations"], r_m["annotations"]["location"]["start"]["position"])
    return r_m
def get_protein_description(variants, references, selector_model):
    """
    Retrieves the protein description.

    :param variants: Only deletion_insertion variants with coordinate
        locations. Preferable, from the description extractor.
    :param references: References models. Required to be able to retrieve
        the inserted sequences.
    :param selector_model: The selector model that includes the exon and
        cds information.
    :return: A tuple of the protein description, the reference protein
        sequence, and the observed protein sequence ("?" when it is not
        determined).
    """
    sequences = extract_sequences(references)
    ref_id = references["reference"]["annotations"]["id"]
    dna_ref_seq = sequences[ref_id]
    exons = selector_model["exon"]
    cds_start = selector_model["cds"][0][0]
    cds_end = selector_model["cds"][0][1]
    protein_id = selector_model["protein_id"]

    def _describe(change):
        # All returned descriptions share the "<reference>(<protein>):" prefix.
        return "{}({}):{}".format(ref_id, protein_id, change)

    cds_seq = slice_seq(dna_ref_seq, exons, cds_start, cds_end)
    if selector_model["inverted"]:
        cds_seq = reverse_complement(cds_seq)
        cds_seq_ext = reverse_complement(
            slice_seq(dna_ref_seq, exons, 0, cds_end)
        )
    else:
        cds_seq_ext = slice_seq(dna_ref_seq, exons, cds_start)

    p_ref_seq = str(Seq(cds_seq).translate())

    cds_variants, splice_site_hits = to_cds_coordinate(
        variants, sequences, selector_model
    )

    if splice_site_hits:
        return _describe("p.?"), p_ref_seq, "?"
    if not cds_variants:
        return _describe("p.(=)"), p_ref_seq, p_ref_seq

    cds_obs_seq = mutate({"reference": cds_seq_ext}, cds_variants)
    p_obs_seq = str(Seq(cds_obs_seq).translate())

    # A changed first codon leaves the protein undetermined.
    if cds_seq[:3] != cds_obs_seq[:3]:
        return _describe("p.?"), p_ref_seq, "?"

    # Up to and including the first '*', or the entire string.
    stop = p_obs_seq.find("*")
    if stop != -1:
        p_obs_seq = p_obs_seq[: stop + 1]

    cds_stop = len(mutate({"reference": cds_seq}, cds_variants))
    description = protein_description(cds_stop, p_ref_seq, p_obs_seq)

    if len(cds_variants) > 1:
        # TODO: This seems to happen in M2. Check why.
        change = "p.?"
    else:
        change = description[0]
    return _describe(change), p_ref_seq, p_obs_seq