def lcs(self, other, *args, limit=25, **kwargs):
    """Return the longest common substring between this sequence and another.

    The other sequence can be a string, Seq, SeqRecord, Dseq or
    DseqRecord. Both sequences are compared in lower case.

    The method returns a SeqFeature with type "read", as this method is
    mostly used to map sequence reads to the sequence. This can be
    changed by passing ``type`` as a keyword argument with some other
    string value.

    Parameters
    ----------
    other : str, Seq, SeqRecord, Dseq or DseqRecord
        Sequence to compare with. If it has a ``seq`` attribute that is
        used; if that in turn has a ``watson`` attribute, the watson
        strand is used.
    *args
        Accepted for backward compatibility and ignored.
    limit : int, optional
        Forwarded to ``_common_sub_strings``, which is expected to ignore
        shorter matches. A falsy value falls back to 25.
    **kwargs
        ``type`` overrides the feature type ("read" by default) and
        ``label`` overrides the label qualifier (by default ``other.name``
        when present, else "sequence"). Other keywords are ignored.

    Returns
    -------
    SeqFeature
        Feature on the forward strand covering the first common substring
        reported by ``_common_sub_strings``, or an empty SeqFeature when
        none is reported.

    Examples
    --------
    >>> from pydna.seqrecord import SeqRecord
    >>> a = SeqRecord("GGATCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(6), strand=1), type='read')
    >>> a.lcs("GATC", limit=4)
    SeqFeature(FeatureLocation(ExactPosition(1), ExactPosition(5), strand=1), type='read')
    >>> a = SeqRecord("CCCCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(None)
    """
    # longest_common_substring
    # https://biopython.org/wiki/ABI_traces
    if hasattr(other, "seq"):
        r = other.seq
        if hasattr(r, "watson"):
            r = str(r.watson).lower()
        else:
            r = str(r).lower()
    else:
        r = str(other.lower())

    olaps = _common_sub_strings(str(self.seq).lower(), r, limit=limit or 25)
    if not olaps:
        return _SeqFeature()

    start_in_self, _, length = olaps[0]
    label = other.name if hasattr(other, "name") else "sequence"
    return _SeqFeature(
        # The strand is set on the location rather than passed to the
        # SeqFeature constructor: Biopython deprecated (and later removed)
        # the ``strand`` argument of SeqFeature, while the location has
        # accepted it all along, so the resulting feature is the same.
        _FeatureLocation(start_in_self, start_in_self + length, strand=1),
        type=kwargs.get("type") or "read",
        qualifiers={
            "label": [kwargs.get("label") or label],
            "ApEinfo_fwdcolor": ["#DAFFCF"],
            "ApEinfo_revcolor": ["#DFFDFF"],
        },
    )
def olaps(self, other, *args, **kwargs):
    """Return the stretches of this sequence that overlap with another one.

    The other sequence can be a string, Seq, SeqRecord, Dseq or
    DseqRecord. Both sequences are compared in lower case. Positional
    extras (``*args``) are ignored; keyword arguments are forwarded
    unchanged to ``_common_sub_strings``.

    Returns
    -------
    list
        One slice of ``self`` per hit reported by ``_common_sub_strings``,
        in the order the hits were reported.
    """
    if not hasattr(other, "seq"):
        query = str(other.lower())
    else:
        inner = other.seq
        # Double-stranded sequences expose the strand of interest as .watson
        query = str(inner.watson if hasattr(inner, "watson") else inner).lower()

    hits = _common_sub_strings(str(self.seq).lower(), query, **kwargs)

    fragments = []
    for hit in hits:
        # hit[0] is the start within self, hit[2] the length of the overlap
        begin = hit[0]
        fragments.append(self[begin:begin + hit[2]])
    return fragments
def __init__(self, watson, crick=None, ovhg=None, linear=None, circular=None, pos=0):
    """Set up the two strands, their stagger and the topology.

    Parameters
    ----------
    watson : str
        The watson strand.
    crick : str, optional
        The crick strand. When omitted it is set to ``_rc(watson)`` and
        the stagger is zero.
    ovhg : int, optional
        Stagger between the strands. A positive value means that the last
        ``ovhg`` bases of ``crick`` (reverse complemented) precede
        ``watson``; a negative value means ``watson`` starts first. When
        omitted while ``crick`` is given, it is derived by annealing the
        strands with ``_common_sub_strings``.
    linear, circular : optional
        Topology flags. The object is circular when ``circular`` is truthy
        and ``linear`` is falsy, or when ``linear == False`` and
        ``circular`` is None; otherwise it is linear.
    pos : int, optional
        Stored unchanged as ``self.pos``.

    Raises
    ------
    ValueError
        If ``ovhg`` is given without ``crick``, if the strands cannot be
        annealed, or if they can be annealed in more than one equally
        good way.
    """
    if crick is None:
        if ovhg is not None:
            # ovhg given, but no crick strand
            raise ValueError("ovhg defined without crick strand!")
        crick = _rc(watson)
        ovhg = 0
        self._data = watson
    elif ovhg is None:
        # crick strand given without ovhg: find out how the strands anneal
        watson_lc = str(watson).lower()
        crick_rc_lc = str(_rc(crick).lower())
        min_length = int(_math.log(len(watson)) / _math.log(4))
        overlaps = _common_sub_strings(watson_lc, crick_rc_lc, min_length)
        try:
            best = overlaps[0]
        except IndexError:
            raise ValueError(
                "Could not anneal the two strands. Please provide ovhg value"
            )
        pos_in_watson, pos_in_crick, best_length = best
        # Every overlap as long as the first one is an equally good way
        # to anneal; more than one makes the stagger ambiguous.
        candidate_ovhgs = [
            hit[1] - hit[0] for hit in overlaps if hit[2] == best_length
        ]
        if len(candidate_ovhgs) > 1:
            raise ValueError(
                "More than one way of annealing the strands. Please provide ovhg value"
            )
        ovhg = pos_in_crick - pos_in_watson
        # Pad whichever strand starts later (a negative count gives an
        # empty pad) and take, per column, the watson base if there is
        # one, else the base from the reverse-complemented crick strand.
        top = (ovhg * " ") + _pretty_str(watson)
        bottom = (-ovhg * " ") + _pretty_str(_rc(crick))
        self._data = "".join(
            [
                upper.strip() or lower.strip()
                for upper, lower in _itertools.zip_longest(top, bottom, fillvalue=" ")
            ]
        )
    elif ovhg == 0:
        # Strands start together; crick may extend past the end of watson
        if len(watson) >= len(crick):
            self._data = watson
        else:
            self._data = watson + _rc(crick[:len(crick) - len(watson)])
    elif ovhg > 0:
        # crick protrudes before the start of watson ...
        assembled = _rc(crick[-ovhg:]) + watson
        trailing = len(crick) - ovhg - len(watson)
        if trailing >= 0:
            # ... and reaches at least to the end of watson
            assembled = assembled + _rc(crick[:trailing])
        self._data = assembled
    else:
        # ovhg < 0: watson starts first; crick may extend past its end
        trailing = -ovhg + len(crick) - len(watson)
        if trailing > 0:
            self._data = watson + _rc(crick[:trailing])
        else:
            self._data = watson

    # ``==`` (not ``is``) is deliberate so that linear=0 also counts.
    self._circular = (bool(circular) and not bool(linear)) or (
        linear == False and circular is None  # noqa: E712
    )
    self._linear = not self._circular
    self.watson = _pretty_str(watson)
    self.crick = _pretty_str(crick)
    # self.length = max(len(watson)+max(0,ovhg), len(crick)+max(0,-ovhg))
    self.length = len(self._data)
    self._ovhg = ovhg
    self.pos = pos
    # Self-assignment kept as in the original implementation.
    self._data = self._data
def map_trace_files(self, pth, limit=25):  # TODO allow path-like objects
    """Map ABI trace files onto this sequence as "trace" features.

    Parameters
    ----------
    pth : str
        Glob pattern selecting the trace files to read.
    limit : int, optional
        Passed as third argument to ``_common_sub_strings`` when matching
        each read against this sequence.

    Returns
    -------
    list
        The file names of the reads for which a feature was added.

    Raises
    ------
    ValueError
        If the pattern matches no files.

    Notes
    -----
    If ``self.map_target`` has a ``step`` attribute it is used directly to
    slice ``self``; if it has an ``extract`` attribute, the slice is built
    from its ``location.start`` and ``location.end``. With such a target
    area, only reads containing the area (or its reverse complement) are
    mapped, and ``self.matching_reads`` / ``self.not_matching_reads`` are
    filled accordingly. Otherwise both attributes are set to None and all
    reads are mapped.
    """
    import glob

    traces = []
    for fname in glob.glob(pth):
        record = SeqIO.read(fname, "abi").lower()
        record.annotations["filename"] = fname
        record.fname = fname
        traces.append(record)
    if not traces:
        raise ValueError("No trace files found in {}".format(pth))

    if hasattr(self.map_target, "step"):
        area = self.map_target
    elif hasattr(self.map_target, "extract"):
        area = slice(self.map_target.location.start,
                     self.map_target.location.end)
    else:
        area = None  # TODO allow other objects as well and do some checks on map target

    if area:
        self.matching_reads = []
        self.not_matching_reads = []
        wanted = str(self[area].seq).lower()
        wanted_rc = str(self[area].seq.rc()).lower()
        for record in traces:
            read_text = str(record.seq)
            covers_target = wanted in read_text or wanted_rc in read_text
            bucket = self.matching_reads if covers_target else self.not_matching_reads
            bucket.append(record)
        reads = self.matching_reads
    else:
        self.matching_reads = None
        self.not_matching_reads = None
        reads = traces

    mapped = []
    for read_ in reads:
        matches = _common_sub_strings(
            str(self.seq).lower(), str(read_.seq), limit)
        if not matches:
            continue

        # Always keep the first match; keep a later one only if the match
        # listed just before it ends strictly before it starts, both in
        # this sequence and in the read.
        kept = [matches[0]]
        for (prev_here, prev_there, prev_len), nxt in zip(matches, matches[1:]):
            if prev_here + prev_len < nxt[0] and prev_there + prev_len < nxt[1]:
                kept.append(nxt)

        mapped.append(read_)

        if len(kept) > 1:
            loc = _CompoundLocation(
                [_FeatureLocation(m[0], m[0] + m[2]) for m in kept])
        else:
            begin, _, span = kept[0]
            loc = _FeatureLocation(begin, begin + span)

        self.features.append(
            _SeqFeature(
                loc,
                qualifiers={"label": [read_.annotations["filename"]]},
                type="trace",
            ))

    return [record.annotations["filename"] for record in mapped]
def synced(self, ref, limit=25):
    """Return a rotated copy of this circular sequence aligned with ref.

    The new circular sequence (Dseqrecord object) is rotated so that it
    starts where ``ref`` starts to overlap with it. ``ref`` may be a
    string, Biopython Seq, SeqRecord object or another Dseqrecord object.
    Both strands are searched; if the overlap found on the crick strand
    is longer, the reverse complement is rotated instead.

    A typical use is to rotate a new recombinant plasmid so that it
    starts at the same position as the parent plasmid after cloning.

    Raises
    ------
    TypeError
        If the sequence is linear, or if no overlap starting at the first
        position of ``ref`` is found on either strand.

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> a=Dseqrecord("gaat",circular=True)
    >>> a.seq
    Dseq(o4)
    gaat
    ctta
    >>> d = a[2:] + a[:2]
    >>> d.seq
    Dseq(-4)
    atga
    tact
    >>> insert=Dseqrecord("CCC")
    >>> recombinant = (d+insert).looped()
    >>> recombinant.seq
    Dseq(o7)
    atgaCCC
    tactGGG
    >>> recombinant.synced(a).seq
    Dseq(o7)
    gaCCCat
    ctGGGta
    """
    if self.linear:
        raise TypeError("Only circular DNA can be synced!")

    newseq = _copy.copy(self)

    top = str(self.seq.watson).lower()
    bottom = str(self.seq.crick).lower()

    if not hasattr(ref, "seq"):
        query = str(ref.lower())
    else:
        ref_seq = ref.seq
        query = str(ref_seq.watson if hasattr(ref_seq, "watson") else ref_seq).lower()

    # Equals ``limit`` when the sequence is at least ``limit`` long and
    # drops to 1 for shorter sequences (for a positive ``limit``).
    lim = min(limit, limit * (len(top) // limit) + 1)

    # Each strand is doubled so that overlaps spanning the origin are found.
    fwd_hits = _common_sub_strings(top + top, query, limit=lim)
    rev_hits = _common_sub_strings(bottom + bottom, query, limit=lim)

    # Keep (start, length) only for hits whose second coordinate is 0
    fwd = [(hit[0], hit[2]) for hit in fwd_hits if hit[1] == 0]
    rev = [(hit[0], hit[2]) for hit in rev_hits if hit[1] == 0]

    if not (fwd or rev):
        raise TypeError("There is no overlap between sequences!")

    start, length = fwd[0] if fwd else (0, 0)
    start_rc, length_rc = rev[0] if rev else (0, 0)

    if length_rc > length:
        start = start_rc
        newseq = newseq.rc()

    result = newseq.shifted(start) if start != 0 else newseq
    _module_logger.info("synced")
    return Dseqrecord(result)