Пример #1
0
    def lcs(self, other, *args, limit=25, **kwargs):
        """Return the longest common substring shared with another sequence.

        The other sequence can be a string, Seq, SeqRecord, Dseq or
        DseqRecord. When it carries a double-stranded ``seq`` (one with a
        ``watson`` attribute) only the watson strand is compared. Both
        sequences are lower-cased first, so the comparison ignores case.

        The match is returned as a SeqFeature located on this sequence.
        Its type is "read" by default, as this method is mostly used to
        map sequence reads to the sequence; pass ``type`` as a keyword
        argument to get another feature type.

        Parameters
        ----------
        other : str, Seq, SeqRecord, Dseq or DseqRecord
            Sequence to compare with.
        *args
            Accepted for compatibility; not used.
        limit : int, optional
            Forwarded as ``limit`` to the common-substring search
            (presumably the shortest match that is accepted). A falsy
            value falls back to 25.
        **kwargs
            ``type`` overrides the feature type and ``label`` overrides
            the label qualifier. Without ``label``, ``other.name`` is
            used if it exists, otherwise the string "sequence".

        Returns
        -------
        SeqFeature
            Feature covering the first match reported by the search, on
            strand 1, or an empty ``SeqFeature()`` when nothing matches.

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a = SeqRecord("GGATCC")
        >>> a.lcs("GGATCC", limit=6)
        SeqFeature(FeatureLocation(ExactPosition(0),
                                   ExactPosition(6), strand=1), type='read')
        >>> a.lcs("GATC", limit=4)
        SeqFeature(FeatureLocation(ExactPosition(1),
                                   ExactPosition(5), strand=1), type='read')
        >>> a = SeqRecord("CCCCC")
        >>> a.lcs("GGATCC", limit=6)
        SeqFeature(None)

        """
        # longest_common_substring
        # https://biopython.org/wiki/ABI_traces
        if hasattr(other, "seq"):
            r = other.seq
            # Double-stranded sequences are compared through their watson strand.
            r = str(r.watson if hasattr(r, "watson") else r).lower()
        else:
            r = str(other.lower())

        olaps = _common_sub_strings(str(self.seq).lower(),
                                    r,
                                    limit=limit or 25)

        try:
            start_in_self, _, length = olaps[0]
        except IndexError:
            # No common substring was reported: hand back an empty feature.
            return _SeqFeature()

        label = getattr(other, "name", "sequence")
        return _SeqFeature(
            # The strand is set on the location itself: the ``strand``
            # keyword of the SeqFeature constructor is deprecated in
            # Biopython, while the location has always accepted it.
            _FeatureLocation(start_in_self, start_in_self + length, strand=1),
            type=kwargs.get("type") or "read",
            qualifiers={
                "label": [kwargs.get("label") or label],
                "ApEinfo_fwdcolor": ["#DAFFCF"],
                "ApEinfo_revcolor": ["#DFFDFF"],
            },
        )
Пример #2
0
 def olaps(self, other, *args, **kwargs):
     """Return the parts of this sequence that also occur in another one.

     The other sequence can be a string, Seq, SeqRecord, Dseq or
     DseqRecord. If it has a ``seq`` attribute with a ``watson`` strand,
     that strand is the one compared. Both sequences are lower-cased
     before the search.

     Parameters
     ----------
     other : str, Seq, SeqRecord, Dseq or DseqRecord
         Sequence to compare with.
     *args
         Not used.
     **kwargs
         Forwarded unchanged to the common-substring search.

     Returns
     -------
     list
         One slice of ``self`` per reported match, in the order the
         search returned them.
     """
     if not hasattr(other, "seq"):
         query = str(other.lower())
     else:
         other_seq = other.seq
         strand = other_seq.watson if hasattr(other_seq, "watson") else other_seq
         query = str(strand).lower()

     hits = _common_sub_strings(str(self.seq).lower(), query, **kwargs)

     fragments = []
     for hit in hits:
         # hit[0] is the start in self, hit[2] the length of the match.
         begin = hit[0]
         fragments.append(self[begin:begin + hit[2]])
     return fragments
Пример #3
0
    def __init__(self,
                 watson,
                 crick=None,
                 ovhg=None,
                 linear=None,
                 circular=None,
                 pos=0):
        """Set up a double-stranded sequence from one or two strands.

        Parameters
        ----------
        watson : str
            The watson strand.
        crick : str, optional
            The crick strand. When omitted, the reverse complement of
            ``watson`` is used and the stagger is set to 0.
        ovhg : int, optional
            Stagger between the strands. A positive value places the
            reverse complement of ``crick`` to the left of ``watson``; a
            negative value places ``watson`` to the left. When omitted
            while ``crick`` is given, it is derived from the common
            substrings of ``watson`` and the reverse complement of
            ``crick``.
        linear, circular : optional
            Topology flags. The sequence is circular when ``circular`` is
            truthy and ``linear`` is falsy, or when ``linear`` equals
            False and ``circular`` is None; otherwise it is linear.
        pos : int, optional
            Stored unchanged as ``self.pos``.

        Raises
        ------
        ValueError
            If ``ovhg`` is given without ``crick``, if the strands share
            no common substring, or if more than one common substring has
            the same length as the first one reported.
        """
        if crick is None:
            if ovhg is not None:
                raise ValueError("ovhg defined without crick strand!")
            # Single strand given: pair it with its own reverse complement.
            crick = _rc(watson)
            ovhg = 0
            data = watson
        elif ovhg is None:
            # Both strands but no stagger: work it out from their overlap.
            top = str(watson).lower()
            bottom = str(_rc(crick).lower())
            min_len = int(_math.log(len(watson)) / _math.log(4))
            overlaps = _common_sub_strings(top, bottom, min_len)
            try:
                start_w, start_c, best_len = overlaps[0]
            except IndexError:
                raise ValueError(
                    "Could not anneal the two strands. Please provide ovhg value"
                )
            candidates = [
                hit[1] - hit[0] for hit in overlaps if hit[2] == best_len
            ]
            if len(candidates) > 1:
                raise ValueError(
                    "More than one way of annealing the strands. Please provide ovhg value"
                )
            ovhg = start_c - start_w

            # Pad both strands with spaces so that they line up, then take
            # the watson character where there is one and the crick
            # (reverse complement) character elsewhere.
            padded_w = (ovhg * " ") + _pretty_str(watson)
            padded_c = (-ovhg * " ") + _pretty_str(_rc(crick))
            paired = _itertools.zip_longest(padded_w, padded_c, fillvalue=" ")
            data = "".join(w.strip() or c.strip() for w, c in paired)
        elif ovhg == 0:
            if len(watson) < len(crick):
                data = watson + _rc(crick[:len(crick) - len(watson)])
            else:
                data = watson
        elif ovhg > 0:
            leading = _rc(crick[-ovhg:])
            if ovhg + len(watson) > len(crick):
                data = leading + watson
            else:
                data = (leading + watson +
                        _rc(crick[:len(crick) - ovhg - len(watson)]))
        else:  # ovhg < 0
            if -ovhg + len(crick) > len(watson):
                data = watson + _rc(
                    crick[:-ovhg + len(crick) - len(watson)])
            else:
                data = watson

        self._data = data

        self._circular = (
            (bool(circular) and (bool(linear) ^ bool(circular)))
            or (linear == False and circular is None)
        )
        self._linear = not self._circular
        self.watson = _pretty_str(watson)
        self.crick = _pretty_str(crick)
        self.length = len(self._data)
        self._ovhg = ovhg
        self.pos = pos
        # Self-assignment kept as is; it does nothing for a plain attribute.
        self._data = self._data
Пример #4
0
    def map_trace_files(self, pth, limit=25):
        """Map the trace files matching a glob pattern onto this sequence.

        Every file matching ``pth`` is read in "abi" format and
        lower-cased. If ``self.map_target`` is slice-like (has a ``step``
        attribute) or feature-like (has an ``extract`` attribute), only
        reads containing that region of the sequence, or its reverse
        complement, are mapped. These reads are stored in
        ``self.matching_reads`` and the others in
        ``self.not_matching_reads``. Without a usable target both
        attributes are set to None and all reads are mapped.

        For each mapped read that shares at least one common substring
        with the sequence, a feature of type "trace" labelled with the
        file name is appended to ``self.features``.

        Parameters
        ----------
        pth : str, bytes or os.PathLike
            Glob pattern selecting the trace files.
        limit : int, optional
            Forwarded to the common-substring search (presumably the
            shortest match that is accepted).

        Returns
        -------
        list
            File names of the reads for which a feature was added.

        Raises
        ------
        ValueError
            If no file matches ``pth``.
        """
        import glob
        import os

        # glob.glob() only takes str or bytes patterns, so path-like
        # objects are converted first.
        pattern = os.fspath(pth)

        traces = []
        for name in glob.glob(pattern):
            trace = SeqIO.read(name, "abi").lower()
            trace.annotations["filename"] = trace.fname = name
            traces.append(trace)
        if not traces:
            raise ValueError("No trace files found in {}".format(pth))

        map_target = self.map_target
        if hasattr(map_target, "step"):
            area = map_target
        elif hasattr(map_target, "extract"):
            area = slice(map_target.location.start,
                         map_target.location.end)
        else:
            area = None  # TODO allow other objects as well and do some checks on map target

        if area:
            self.matching_reads = []
            self.not_matching_reads = []
            region = self[area].seq
            target = str(region).lower()
            target_rc = str(region.rc()).lower()
            for trace in traces:
                read_seq = str(trace.seq)
                if target in read_seq or target_rc in read_seq:
                    self.matching_reads.append(trace)
                else:
                    self.not_matching_reads.append(trace)
            reads = self.matching_reads
        else:
            self.matching_reads = None
            self.not_matching_reads = None
            reads = traces

        mapped_reads = []
        # The template does not change while mapping, so build it once.
        template = str(self.seq).lower()

        for read_ in reads:
            matches = _common_sub_strings(template, str(read_.seq), limit)

            if not matches:
                continue

            # Always keep the first match; keep a later one only if it
            # starts beyond the end of the match listed just before it,
            # both on the template and on the read.
            kept = [matches[0]]
            for (g, f, h), nxt in zip(matches, matches[1:]):
                if g + h < nxt[0] and f + h < nxt[1]:
                    kept.append(nxt)

            mapped_reads.append(read_)

            if len(kept) > 1:
                loc = _CompoundLocation(
                    [_FeatureLocation(m[0], m[0] + m[2]) for m in kept])
            else:
                start, _, length = kept[0]
                loc = _FeatureLocation(start, start + length)

            self.features.append(
                _SeqFeature(
                    loc,
                    qualifiers={"label": [read_.annotations["filename"]]},
                    type="trace",
                ))

        return [x.annotations["filename"] for x in mapped_reads]
Пример #5
0
    def synced(self, ref, limit=25):
        """Return a rotated copy of this circular sequence aligned with ref.

        The new Dseqrecord is shifted so that it begins where the start
        of ``ref`` matches it. ``ref`` may be a string, Biopython Seq,
        SeqRecord object or another Dseqrecord object. Both strands are
        searched; if the crick strand gives the longer match, the
        reverse complement is the one that gets rotated. Each strand is
        doubled before the search, presumably so that matches running
        across the origin are found as well.

        A typical use is rotating a new recombinant plasmid so that it
        starts at the same position as before cloning. See the example
        below:


        Examples
        --------

        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("gaat",circular=True)
        >>> a.seq
        Dseq(o4)
        gaat
        ctta
        >>> d = a[2:] + a[:2]
        >>> d.seq
        Dseq(-4)
        atga
        tact
        >>> insert=Dseqrecord("CCC")
        >>> recombinant = (d+insert).looped()
        >>> recombinant.seq
        Dseq(o7)
        atgaCCC
        tactGGG
        >>> recombinant.synced(a).seq
        Dseq(o7)
        gaCCCat
        ctGGGta

        """

        if self.linear:
            raise TypeError("Only circular DNA can be synced!")

        rotated = _copy.copy(self)

        top = str(self.seq.watson).lower()
        bottom = str(self.seq.crick).lower()

        if not hasattr(ref, "seq"):
            r = str(ref.lower())
        else:
            ref_seq = ref.seq
            strand = ref_seq.watson if hasattr(ref_seq, "watson") else ref_seq
            r = str(strand).lower()

        # Equals limit when the sequence is at least limit long, else 1.
        lim = min(limit, limit * (len(top) // limit) + 1)

        fwd_hits = _common_sub_strings(top + top, r, limit=lim)
        rev_hits = _common_sub_strings(bottom + bottom, r, limit=lim)

        # Only matches beginning at the first position of ref are relevant.
        fwd = [(hit[0], hit[2]) for hit in fwd_hits if hit[1] == 0]
        rev = [(hit[0], hit[2]) for hit in rev_hits if hit[1] == 0]

        if not (fwd or rev):
            raise TypeError("There is no overlap between sequences!")

        start, length = fwd[0] if fwd else (0, 0)
        start_rc, length_rc = rev[0] if rev else (0, 0)

        if length_rc > length:
            start = start_rc
            rotated = rotated.rc()

        result = rotated if start == 0 else rotated.shifted(start)
        _module_logger.info("synced")
        return Dseqrecord(result)