예제 #1
0
 def _check_superfluous(self, path):
     """
     Verify the redundant sequence or length supplied with a variant.

     Examples of incorrect descriptions:
         NM_000143.3:c.45delT
         NG_012337.1:g.274ATT>T
         NM_000143.3:c.45dupT
         NM_000143.3:c.45dup4

     An error is added when the supplied length differs from the length
     of the variant location, or when the supplied sequence differs from
     the reference sequence at that location.

     :param path: Model path towards "deleted" or "inserted".
     """
     variant = self.internal_indexing_model["variants"][path[1]]
     key = path[-1]
     sequences = self.get_sequences()
     parts = variant[key]

     # A single part with an explicit length value: compare lengths only.
     if (len(parts) == 1 and parts[0].get("length")
             and parts[0]["length"].get("value")):
         provided_length = parts[0]["length"].get("value")
         location_length = get_location_length(variant["location"])
         if location_length != provided_length:
             self._add_error(
                 errors.length_mismatch(
                     location_length, provided_length, path))
         return

     # Otherwise compare the supplied sequence with the reference slice.
     reference_slice = slice_sequence(
         variant["location"], sequences["reference"])
     provided_seq = construct_sequence(parts, sequences)
     if not (provided_seq and reference_slice):
         return
     if provided_seq == reference_slice:
         return

     if self.is_inverted():
         # Report the mismatch as seen on the reverse strand.
         provided_seq = reverse_complement(provided_seq)
         reference_slice = reverse_complement(reference_slice)
         path = reverse_path(self.internal_coordinates_model, path)
     self._add_error(
         errors.sequence_mismatch(reference_slice, provided_seq, path))
예제 #2
0
def reverse_strand_shift(variants, seq):
    """
    Convert, in place, the inserted/deleted parts of variants to the
    reverse strand.

    For every variant the order of the "inserted" and "deleted" parts is
    reversed and their literal sequences are reverse complemented. A
    single inserted sequence whose location start carries a "shift" is
    first rotated using the reference bases preceding the start.

    :param variants: Variant models; they are modified in place.
    :param seq: Sequence from which the bases preceding the start are
                taken when a shift is present.
    """
    for variant in variants:
        inserted_parts = variant.get("inserted")
        if inserted_parts:
            inserted_parts.reverse()

            # The shift is only looked up for a single literal insertion.
            shift = None
            if len(inserted_parts) == 1 and inserted_parts[0].get("sequence"):
                shift = variant["location"]["start"].get("shift")

            if shift:
                # TODO: Check what to do when there is a compound insertion with locations included.
                start = get_start(variant)
                ins_seq = inserted_parts[0]["sequence"]
                rotated = (seq[start - shift : start] + ins_seq)[: len(ins_seq)]
                inserted_parts[0]["sequence"] = reverse_complement(rotated)
            else:
                for part in inserted_parts:
                    if part.get("sequence"):
                        part["sequence"] = reverse_complement(part["sequence"])

        deleted_parts = variant.get("deleted")
        if deleted_parts:
            deleted_parts.reverse()
            for part in deleted_parts:
                if part.get("sequence"):
                    part["sequence"] = reverse_complement(part["sequence"])
예제 #3
0
def map_description(
    description,
    reference_id,
    selector_id=None,
    slice_to=None,
    clean=False,
):
    """
    Describe the observed sequence of a description relative to the
    reference retrieved for ``reference_id``.

    :param description: Description string; it is normalized to obtain
                        its observed and reference sequences.
    :param reference_id: ID of the reference to describe against.
    :param selector_id: Optional selector ID. When its selector model is
                        inverted, the sequences obtained from the
                        description are reverse complemented.
    :param slice_to: Optional; when truthy the retrieved reference model
                     is replaced by the result of ``_get_reference_model``.
    :param clean: When truthy, variants that also appear between the two
                  reference sequences are filtered out of the result.
    :return: The output of ``_get_description``, or a dictionary with an
             "errors" list when something went wrong.
    """
    # Get the observed sequence
    d = Description(description)
    d.normalize()
    if d.errors:
        return {"errors": d.errors}
    # Either missing references or a missing observed entry is an error;
    # with `and` a non-empty model lacking "observed" slipped through and
    # failed with a KeyError on the lookup below.
    if not d.references or not d.references.get("observed"):
        return {"errors": [{"details": "No observed sequence or other error occured."}]}
    obs_seq = d.references["observed"]["sequence"]["seq"]

    r_model = retrieve_reference(reference_id)
    if r_model is None:
        return {"errors": [reference_not_retrieved(reference_id, [])]}

    ref_seq2 = d.references["reference"]["sequence"]["seq"]

    if selector_id:
        s_model = get_selector_model(r_model["annotations"], selector_id, True)
        if s_model is None:
            return {"errors": [no_selector_found(reference_id, selector_id, [])]}
        if s_model["inverted"]:
            obs_seq = reverse_complement(obs_seq)
            ref_seq2 = reverse_complement(ref_seq2)

    if slice_to:
        r_model = _get_reference_model(r_model, selector_id, slice_to)

    ref_seq1 = r_model["sequence"]["seq"]

    # Get the description extractor hgvs internal indexing variants
    variants = _extract_hgvs_internal_model(obs_seq, r_model)

    if clean:
        # Variants between the two reference sequences themselves.
        raw_de_variants = extractor.describe_dna(ref_seq1, ref_seq2)
        seq_variants = de_to_hgvs(
            raw_de_variants,
            {"reference": ref_seq1, "observed": ref_seq2},
        )
        # Filtering is only possible when every reference difference is
        # also present among the extracted variants.
        if any(v not in variants for v in seq_variants):
            return {
                "errors": [{"code": "EMAPFILTER", "details": "Unsuccessful filtering."}]
            }
        variants = [v for v in variants if v not in seq_variants]

    return _get_description(variants, r_model, selector_id)
예제 #4
0
    def _check_repeat(self, path):
        """
        Check that a repeat variant is consistent with the reference.

        Only a single inserted part with a literal sequence whose source
        is "description" is supported; anything else is reported as not
        supported. For a supported repeat, the reference slice at the
        variant location must consist of whole copies of the repeat unit,
        otherwise an error is added.

        :param path: Model path of the variant being checked.
        """
        if self.is_inverted():
            input_path = reverse_path(self.internal_indexing_model, path)
        else:
            input_path = path
        input_variant = self.input_model["variants"][input_path[1]]
        indexed_variant = self.internal_indexing_model["variants"][path[1]]

        inserted_parts = indexed_variant.get("inserted")
        if not (inserted_parts and len(indexed_variant.get("inserted")) == 1):
            # TODO: Convert to delins and switch to warning?
            self._add_error(errors.repeat_not_supported(input_variant, path))
            return

        unit = indexed_variant["inserted"][0]
        if not (unit.get("sequence") and unit.get("source") == "description"):
            # TODO: get the sequence from a reference slice
            self._add_error(errors.repeat_not_supported(input_variant, path))
            return
        repeat_seq = unit["sequence"]

        full_reference = self.references["reference"]["sequence"]["seq"]
        ref_seq = full_reference[
            get_start(indexed_variant):get_end(indexed_variant)]
        if self.is_inverted():
            ref_seq = reverse_complement(ref_seq)

        unit_length = len(repeat_seq)
        if len(ref_seq) % unit_length != 0:
            # The reference slice is not a whole number of units long.
            self._add_error(errors.repeat_reference_sequence_length(path))
        elif (len(ref_seq) // unit_length) * repeat_seq != ref_seq:
            # Right length, but the content is not the unit repeated.
            self._add_error(
                errors.repeat_sequences_mismatch(ref_seq, repeat_seq, path))
예제 #5
0
def slice_to_selector(model, selector_id, strand=False):
    """
    Concatenate the exon slices of a selector from the model sequence.

    :param model: Reference model; its ``["annotations"]`` are used to
                  look up the selector and ``["sequence"]["seq"]`` is
                  sliced.
    :param selector_id: ID of the selector whose exons are used.
    :param strand: When truthy and the selector model is inverted, the
                   concatenated sequence is reverse complemented.
    :return: The concatenated exon sequence.
    """
    s_m = get_selector_model(model["annotations"], selector_id)
    sequence = model["sequence"]["seq"]
    # Each exon is a (start, end) pair used as slice boundaries.
    output = "".join(sequence[exon[0]:exon[1]] for exon in s_m["exon"])
    if strand and s_m["inverted"]:
        output = reverse_complement(output)
    return output
예제 #6
0
def merge_inserted_to_string(inserted, sequences):
    """
    Merge a list of inserted parts into a single literal insertion.

    :param inserted: Inserted parts; each either carries a literal
                     "sequence" or is resolved with
                     ``get_inserted_sequence``.
    :param sequences: Sequences passed on to ``get_inserted_sequence``.
    :return: Dictionary with "source" set to "description" and
             "sequence" set to the concatenation of all parts.
    """
    parts = []
    for insertion in inserted:
        if insertion.get("sequence"):
            part = insertion["sequence"]
        else:
            part = get_inserted_sequence(insertion, sequences)
        if insertion.get("inverted"):
            # Invert only this part; reverse complementing the whole
            # accumulated value would also flip (and reorder) the parts
            # that came before it.
            part = reverse_complement(part)
        parts.append(part)

    return {"source": "description", "sequence": "".join(parts)}
예제 #7
0
def _get_ref_seq(r_model, selector_id=None):
    """

    :param r_model:
    :param selector_id:
    :param slice_to:
    :return:
    """
    ref_seq = r_model["sequence"]["seq"]
    if selector_id:
        s_model = get_selector_model(r_model, selector_id, True)
        if s_model["inverted"]:
            ref_seq = reverse_complement(ref_seq)
    return ref_seq
예제 #8
0
def construct_sequence(slices, sequences):
    """
    Build one sequence by concatenating a list of slices.

    Each slice contributes either its literal "sequence" or, when it has
    a "location", the corresponding slice of ``sequences[source]``. A
    "repeat_number" of type "point" multiplies the fragment, and an
    "inverted" slice is reverse complemented before being appended.

    :param slices: Slice models to concatenate, in order.
    :param sequences: Sequences keyed by the "source" of a slice.
    :return: The concatenated sequence.
    :raises Exception: When a slice has neither "sequence" nor "location".
    """
    seq = ""
    for part in slices:
        literal = part.get("sequence")
        if literal:
            fragment = literal
        elif part.get("location"):
            fragment = slice_sequence(
                part["location"], sequences[part["source"]])
        else:
            raise Exception("Unrecognized slice", part)

        repeat = part.get("repeat_number")
        if repeat and repeat.get("type") == "point":
            fragment = fragment * repeat["value"]

        if part.get("inverted"):
            fragment = reverse_complement(fragment)

        seq += fragment

    return seq
예제 #9
0
def _fix_ensembl(r_m, r_id):
    """
    Rework, in place, a reference model around one of its features.

    The feature matching ``r_id`` (any version suffix after the first
    "." is ignored) becomes the only top-level feature of the
    annotations; an "mRNA" feature is wrapped in the submodel found two
    levels up its feature path. When the feature is on strand -1 the
    sequence is reverse complemented. The annotations ID, "mol_type"
    qualifier, feature IDs and locations are updated accordingly.

    :param r_m: Reference model with "annotations" and "sequence".
    :param r_id: Feature ID, optionally with a ".version" suffix.
    :return: The same (modified) reference model.
    """
    annotations = r_m["annotations"]
    base_id = r_id.split(".")[0] if "." in r_id else r_id

    feature = extract_feature_model(annotations, base_id, ancestors=False)[0]
    if feature["location"]["strand"] == -1:
        r_m["sequence"]["seq"] = reverse_complement(r_m["sequence"]["seq"])
    versioned_id = feature["id"] + "." + feature["qualifiers"]["version"]

    top_feature = feature
    if feature["type"] == "mRNA":
        # Keep the ancestor two levels up, with this feature as its only child.
        feature_path = get_feature_path(annotations, base_id)
        gene_model = get_submodel_by_path(annotations, feature_path[:-2])
        gene_model["features"] = [feature]
        top_feature = gene_model

    _update_ensembl_ids(top_feature)
    annotations["features"] = [top_feature]
    annotations["id"] = versioned_id
    if annotations.get("qualifiers") is None:
        annotations["qualifiers"] = {}
    annotations["qualifiers"]["mol_type"] = "genomic DNA"
    _update_ensembl_locations(
        annotations, annotations["location"]["start"]["position"])
    return r_m
예제 #10
0
def get_protein_description(variants, references, selector_model):
    """
    Build the protein level description for a set of variants.

    :param variants: Only deletion_insertion variants with coordinate locations.
                     Preferable, from the description extractor.
    :param references: References models. Required to be able to retrieve the
                       inserted sequences.
    :param selector_model: The selector model that includes the exon and cds
                           information.
    :return: Tuple of the protein description, the reference protein
             sequence, and the observed protein sequence ("?" when it
             is not determined).
    """
    sequences = extract_sequences(references)
    ref_id = references["reference"]["annotations"]["id"]
    dna_ref_seq = sequences[ref_id]
    exons = selector_model["exon"]
    cds_start = selector_model["cds"][0][0]
    cds_end = selector_model["cds"][0][1]
    protein_id = selector_model["protein_id"]

    def _label(protein_part):
        # Shared "<reference>(<protein>):<p. part>" layout of every result.
        return "{}({}):{}".format(ref_id, protein_id, protein_part)

    cds_seq = slice_seq(dna_ref_seq, exons, cds_start, cds_end)
    if selector_model["inverted"]:
        cds_seq = reverse_complement(cds_seq)
        cds_seq_ext = reverse_complement(
            slice_seq(dna_ref_seq, exons, 0, cds_end)
        )
    else:
        cds_seq_ext = slice_seq(dna_ref_seq, exons, cds_start)

    p_ref_seq = str(Seq(cds_seq).translate())

    cds_variants, splice_site_hits = to_cds_coordinate(
        variants, sequences, selector_model
    )

    if splice_site_hits:
        return _label("p.?"), p_ref_seq, "?"
    if not cds_variants:
        return _label("p.(=)"), p_ref_seq, p_ref_seq

    cds_obs_seq = mutate({"reference": cds_seq_ext}, cds_variants)
    p_obs_seq = str(Seq(cds_obs_seq).translate())

    # A changed first codon gives an unknown protein consequence.
    if cds_seq[:3] != cds_obs_seq[:3]:
        return _label("p.?"), p_ref_seq, "?"

    # Up to and including the first '*', or the entire string.
    stop = p_obs_seq.find("*")
    if stop != -1:
        p_obs_seq = p_obs_seq[: stop + 1]

    cds_stop = len(mutate({"reference": cds_seq}, cds_variants))
    description = protein_description(cds_stop, p_ref_seq, p_obs_seq)

    # TODO: More than one cds variant seems to happen in M2. Check why.
    protein_part = "p.?" if len(cds_variants) > 1 else description[0]
    return (
        _label(protein_part),
        p_ref_seq,
        p_obs_seq,
    )