def lcs(self, other, *args, limit=25, **kwargs):
    """Return the longest common substring shared with another sequence.

    ``other`` can be a string, Seq, SeqRecord, Dseq or DseqRecord. Both
    sequences are compared in lower case. For double stranded objects
    wrapped in a record, the watson strand is used.

    The result is a SeqFeature located on this sequence. Its type is
    "read" by default, as this method is mostly used to map sequence
    reads to the sequence. Pass ``type="..."`` as a keyword argument to
    use another type and ``label="..."`` to override the label
    qualifier (default: ``other.name`` if present, else "sequence").

    ``limit`` is forwarded to the common substring search; a falsy value
    falls back to 25. When the search finds nothing, an empty SeqFeature
    is returned.

    Examples
    --------
    >>> from pydna.seqrecord import SeqRecord
    >>> a = SeqRecord("GGATCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(6), strand=1), type='read')
    >>> a.lcs("GATC", limit=4)
    SeqFeature(FeatureLocation(ExactPosition(1), ExactPosition(5), strand=1), type='read')
    >>> a = SeqRecord("CCCCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(None)
    """
    # longest_common_substring
    # https://biopython.org/wiki/ABI_traces
    if not hasattr(other, "seq"):
        # Plain string-like object.
        needle = str(other.lower())
    else:
        inner = other.seq
        source = inner.watson if hasattr(inner, "watson") else inner
        needle = str(source).lower()

    hits = _common_sub_strings(str(self.seq).lower(), needle, limit=limit or 25)
    if not hits:
        return _SeqFeature()

    # Only the first reported overlap is used.
    begin, _, span = hits[0]
    default_label = other.name if hasattr(other, "name") else "sequence"
    return _SeqFeature(
        _FeatureLocation(begin, begin + span),
        type=kwargs.get("type") or "read",
        strand=1,
        qualifiers={
            "label": [kwargs.get("label") or default_label],
            "ApEinfo_fwdcolor": ["#DAFFCF"],
            "ApEinfo_revcolor": ["#DFFDFF"],
        },
    )
def embl_gb_fasta(raw, ds, path=None):
    """Parse every EMBL, GenBank or FASTA record found in a text block.

    Parameters
    ----------
    raw : str
        Text containing zero or more records. FASTA entries (starting
        with ``>``) and flat file entries (``LOCUS``/``ID`` ... ``//``)
        are located with a regular expression after dedenting the text.
    ds : bool
        If true, each parsed record is converted with
        ``_Dseqrecord.from_SeqRecord`` (or ``_GenbankFile.from_SeqRecord``
        when ``path`` is also given). Otherwise the parsed record itself
        is returned.
    path : str, optional
        Passed on to ``_GenbankFile.from_SeqRecord``.

    Returns
    -------
    list
        One entry per record that could be parsed, in order of
        appearance. Chunks that parse as none of the three formats are
        skipped.

    Notes
    -----
    Each chunk is tried as EMBL, then GenBank, then FASTA. A chunk is
    considered circular if its first line contains "circular". For
    chunks parsed as GenBank, this is replaced by the result of checking
    the LOCUS residue type and the "topology" annotation.
    """
    pattern = r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)"
    result_list = []
    rawseqs = _re.findall(pattern, _textwrap.dedent(raw + "\n\n"), flags=_re.MULTILINE)

    for rawseq in rawseqs:
        handle = _io.StringIO(rawseq)
        circular = "circular" in rawseq.splitlines()[0]
        try:
            parsed = _SeqIO.read(handle, "embl", alphabet=_IUPACAmbiguousDNA())
        except ValueError:
            handle.seek(0)
            try:
                parsed = _SeqIO.read(handle, "genbank", alphabet=_IUPACAmbiguousDNA())
                # Second pass over the same text to get at the raw LOCUS
                # line information.
                handle.seek(0)
                parser = _RecordParser()
                residue_type = parser.parse(handle).residue_type
                circular = (
                    "circular" in residue_type
                    or parsed.annotations.get("topology") == "circular"
                )
            except ValueError:
                handle.seek(0)
                try:
                    parsed = _SeqIO.read(handle, "fasta", alphabet=_IUPACAmbiguousDNA())
                except ValueError:
                    parsed = ""
        handle.close()

        if parsed:
            from copy import deepcopy as _deepcopy
            ## TODO: clean up !
            from pydna.seqfeature import SeqFeature as _SeqFeature

            # Replace every feature by a pydna SeqFeature carrying a deep
            # copy of the original feature's state.
            nfs = []
            for f in parsed.features:
                nf = _SeqFeature()
                nf.__dict__ = _deepcopy(f.__dict__)
                nfs.append(nf)
            parsed.features = nfs

            if ds and path:
                result_list.append(
                    _GenbankFile.from_SeqRecord(
                        parsed, linear=not circular, circular=circular, path=path
                    )
                )
            elif ds:
                result_list.append(
                    _Dseqrecord.from_SeqRecord(
                        parsed, linear=not circular, circular=circular
                    )
                )
            else:
                result_list.append(parsed)

    return result_list
def embl_gb_fasta(raw, ds, path=None):
    """Parse every EMBL, GenBank or FASTA record found in a text block.

    The text is dedented and split into candidate records with a regular
    expression: FASTA entries starting with ``>`` and flat file entries
    running from ``LOCUS``/``ID`` to ``//``. Each candidate is tried as
    EMBL, then GenBank, then FASTA; candidates that parse as none of
    these are skipped.

    A record is flagged circular when it parsed as GenBank and its
    "topology" annotation contains "circular", or when the word
    "circular" appears on the first line of the raw text.

    Returns a list with one entry per parsed record: a
    ``_GenbankFile`` if ``ds`` and ``path`` are both true, a
    ``_Dseqrecord`` if only ``ds`` is true, else the parsed record.
    """
    pattern = (r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|"
               r"LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)")
    result_list = []
    chunks = _re.findall(pattern, _textwrap.dedent(raw + "\n\n"), flags=_re.MULTILINE)

    for chunk in chunks:
        handle = _io.StringIO(chunk)
        circular = False
        record = ""
        for fmt in ("embl", "genbank", "fasta"):
            handle.seek(0)
            try:
                record = _SeqIO.read(handle, fmt)
                if fmt == "genbank":
                    topology = str(record.annotations.get("topology")).lower()
                    if "circular" in topology:
                        circular = True
            except ValueError:
                record = ""
                continue
            break
        handle.close()

        # hack to pick up topology from malformed files
        first_line_words = chunk.splitlines()[0].lower().split()
        if "circular" in first_line_words:
            circular = True

        if not record:
            continue

        from copy import deepcopy as _deepcopy
        # TODO: clean up !
        from pydna.seqfeature import SeqFeature as _SeqFeature

        # Swap each feature for a pydna SeqFeature holding a deep copy of
        # the original feature's state.
        clones = []
        for original in record.features:
            clone = _SeqFeature()
            clone.__dict__ = _deepcopy(original.__dict__)
            clones.append(clone)
        record.features = clones

        if ds and path:
            converted = _GenbankFile.from_SeqRecord(
                record, linear=not circular, circular=circular, path=path)
        elif ds:
            converted = _Dseqrecord.from_SeqRecord(
                record, linear=not circular, circular=circular)
        else:
            converted = record
        result_list.append(converted)

    return result_list
def rarecodons(self, organism="sce"):
    """Return features marking the rare codons in the first reading frame.

    The sequence is upper-cased and read as consecutive, non-overlapping
    triplets starting at position 0; trailing bases that do not fill a
    whole triplet are ignored. Every triplet present in
    ``_rare_codons[organism]`` gives one SeqFeature covering that triplet,
    with type ``rare_codon_<organism>`` and the triplet itself as the
    ``label`` qualifier.

    Parameters
    ----------
    organism : str, optional
        Key into the ``_rare_codons`` table, "sce" by default. An unknown
        key raises KeyError.

    Returns
    -------
    list
        The features in order of position. They are not added to
        ``self.features``.
    """
    rare_set = _rare_codons[organism]
    text = str(self.seq).upper()
    whole_codons = len(self) // 3
    return [
        _SeqFeature(
            _FeatureLocation(start, start + 3),
            type=f"rare_codon_{organism}",
            qualifiers={"label": text[start:start + 3]},
        )
        for start in range(0, 3 * whole_codons, 3)
        if text[start:start + 3] in rare_set
    ]
def __init__(self, primers, template, limit=13, **kwargs):
    r"""Find where the primers anneal on the template.

    The Anneal class has to be initiated with at least an iterable of
    primers and a template. The template is deep copied, and a
    "primer_bind" feature is added to the copy for every annealing site
    found.

    Parameters
    ----------
    primers : iterable of :class:`Primer` or Biopython SeqRecord like objects
        Primer sequences 5'-3'.

    template : Dseqrecord
        The template sequence 5'-3'.

    limit : int, optional
        limit length of the annealing part of the primers.

    Attributes
    ----------
    products: list
        A list of Amplicon objects, one for each primer pair that may
        form a PCR product.

    Examples
    --------
    >>> from pydna.readers import read
    >>> from pydna.amplify import Anneal
    >>> from pydna.dseqrecord import Dseqrecord as Ds
    >>> t = Ds("tacactcaccgtctatcattatcta"
    ...        "ctatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = read(">p1\ntacactcaccgtctatcattatc", ds = False)
    >>> p2 = read(">p2\ngtgctatcagatgatacagtcg", ds = False)
    >>> ann = Anneal((p1, p2), t)
    >>> print(ann.report())
    Template name 51 nt linear:
    p1 anneals forward (--->) at 23
    p2 anneals reverse (<---) at 29
    >>> ann.products
    [Amplicon(51)]
    >>> amplicon_list = ann.products
    >>> amplicon = amplicon_list.pop()
    >>> amplicon
    Amplicon(51)
    >>> print(amplicon.figure())
    5tacactcaccgtctatcattatc...cgactgtatcatctgatagcac3
                               ||||||||||||||||||||||
                              3gctgacatagtagactatcgtg5
    5tacactcaccgtctatcattatc3
     |||||||||||||||||||||||
    3atgtgagtggcagatagtaatag...gctgacatagtagactatcgtg5
    >>> print(amplicon)
    Dseqrecord
    circular: False
    size: 51
    ID: 51bp U96-TO06Y6pFs74SQx8M1IVTBiY
    Name: 51bp_PCR_prod
    Description: pcr product_p1_p2
    Number of features: 2
    /molecule_type=DNA
    Dseq(-51)
    taca..gcac
    atgt..cgtg
    >>> print(amplicon.program())
    <BLANKLINE>
    |95°C|95°C               |    |tmf:59.5
    |____|_____          72°C|72°C|tmr:59.7
    |5min|30s  \ 47.7°C _____|____|30s/kb
    |    |      \______/ 0: 1|5min|GC 39%
    |    |       30s         |    |51bp
    """
    self.primers = primers
    self.template = _copy.deepcopy(template)
    self.limit = limit
    self.kwargs = kwargs
    self._products = None
    self.forward_primers = []
    self.reverse_primers = []

    tmpl = self.template
    watson = tmpl.seq.watson
    crick = tmpl.seq.crick
    watson_len = len(watson)
    crick_len = len(crick)
    if not tmpl.linear:
        # Non-linear template: search each strand joined to itself,
        # presumably so that sites spanning the origin are found.
        watson = watson + watson
        crick = crick + crick

    fwd_offset = min(tmpl.seq.ovhg, 0)
    rev_offset = max(0, tmpl.seq.ovhg)

    for primer in self.primers:
        primer_text = str(primer.seq)
        # Sites on the crick strand give forward primers.
        for pos, footprint in _annealing_positions(primer_text, crick, self.limit):
            if pos < crick_len:
                self.forward_primers.append(
                    _Primer(
                        primer,
                        position=crick_len - pos - fwd_offset,
                        footprint=footprint,
                    )
                )
        # Sites on the watson strand give reverse primers.
        for pos, footprint in _annealing_positions(primer_text, watson, self.limit):
            if pos < watson_len:
                self.reverse_primers.append(
                    _Primer(
                        primer,
                        position=pos + rev_offset,
                        footprint=footprint,
                    )
                )

    self.forward_primers.sort(key=_operator.attrgetter("position"))
    self.reverse_primers.sort(key=_operator.attrgetter("position"), reverse=True)

    def _coloured(label):
        # Fresh qualifier dict for each feature, so no lists are shared.
        return {
            "label": [label],
            "ApEinfo_fwdcolor": ["#baffa3"],
            "ApEinfo_revcolor": ["#ffbaba"],
        }

    size = len(tmpl)

    for fwd in self.forward_primers:
        if fwd.position - fwd._fp >= 0:
            feature = _SeqFeature(
                _FeatureLocation(fwd.position - fwd._fp, fwd.position),
                type="primer_bind",
                strand=1,
                qualifiers=_coloured(fwd.name),
            )
        else:
            # The footprint starts before position 0: describe it as two
            # parts joined across the origin.
            start = size - fwd._fp + fwd.position
            end = start + fwd._fp - size
            feature = _SeqFeature(
                _CompoundLocation([
                    _FeatureLocation(start, size),
                    _FeatureLocation(0, end),
                ]),
                type="primer_bind",
                location_operator="join",
                qualifiers=_coloured(fwd.name),
            )
        tmpl.features.append(feature)

    for rev in self.reverse_primers:
        if rev.position + rev._fp <= size:
            feature = _SeqFeature(
                _FeatureLocation(rev.position, rev.position + rev._fp),
                type="primer_bind",
                strand=-1,
                qualifiers=_coloured(rev.name),
            )
        else:
            # The footprint runs past the end of the template.
            start = rev.position
            end = rev.position + rev._fp - size
            feature = _SeqFeature(
                _CompoundLocation([
                    _FeatureLocation(0, end),
                    _FeatureLocation(start, size),
                ]),
                type="primer_bind",
                location_operator="join",
                strand=-1,
                qualifiers={"label": [rev.name]},
            )
        tmpl.features.append(feature)
def map_trace_files(self, pth, limit=25):  # TODO allow path-like objects
    """Map ABI trace files onto this sequence as "trace" features.

    Parameters
    ----------
    pth : str
        Glob pattern selecting the trace files to read.
    limit : int, optional
        Passed to the common substring search between this sequence and
        each read.

    Returns
    -------
    list
        The file names of the reads for which at least one common
        substring was found.

    Raises
    ------
    ValueError
        If the glob pattern matches no files.

    Notes
    -----
    Each trace is read in "abi" format and lower-cased, and its file
    name is stored in ``annotations["filename"]`` and ``fname``.

    If ``self.map_target`` has a ``step`` attribute it is used directly
    to slice this record; if it has an ``extract`` attribute, the slice
    is built from its location start and end. In both cases only reads
    containing that region (or its reverse complement) are mapped, and
    ``self.matching_reads`` / ``self.not_matching_reads`` are set to the
    two groups. Otherwise both attributes are set to None and all reads
    are mapped.

    One feature is appended to ``self.features`` for each mapped read. It
    is labelled with the file name and located at the common substrings
    that were kept.
    """
    import glob

    traces = []
    for name in glob.glob(pth):
        trace = SeqIO.read(name, "abi").lower()
        trace.annotations["filename"] = trace.fname = name
        traces.append(trace)
    if not traces:
        raise ValueError("No trace files found in {}".format(pth))

    if hasattr(self.map_target, "step"):
        area = self.map_target
    elif hasattr(self.map_target, "extract"):
        area = slice(self.map_target.location.start,
                     self.map_target.location.end)
    else:
        area = None  # TODO allow other objects as well and do some checks on map target

    if area:
        self.matching_reads = []
        self.not_matching_reads = []
        region = self[area].seq
        target = str(region).lower()
        target_rc = str(region.rc()).lower()
        for trace in traces:
            read_text = str(trace.seq)
            if target in read_text or target_rc in read_text:
                self.matching_reads.append(trace)
            else:
                self.not_matching_reads.append(trace)
        reads = self.matching_reads
    else:
        self.matching_reads = None
        self.not_matching_reads = None
        reads = traces

    # The template text is the same for every read; build it once.
    template_text = str(self.seq).lower()

    matching_reads = []
    for read_ in reads:
        # Each match is (start in self, start in read, length).
        matches = _common_sub_strings(template_text, str(read_.seq), limit)
        if not matches:
            continue

        if len(matches) > 1:
            # Always keep the first match. Keep a later match only if it
            # starts beyond the end of the match listed just before it,
            # both on this sequence and on the read.
            newmatches = [matches[0]]
            for (g, f, h), x in zip(matches, matches[1:]):
                if g + h < x[0] and f + h < x[1]:
                    newmatches.append(x)
        else:  # len(matches)==1
            newmatches = matches

        matching_reads.append(read_)

        if len(newmatches) > 1:
            loc = _CompoundLocation(
                [_FeatureLocation(m[0], m[0] + m[2]) for m in newmatches]
            )
        else:
            a, b, c = newmatches[0]
            loc = _FeatureLocation(a, a + c)

        self.features.append(
            _SeqFeature(
                loc,
                qualifiers={"label": [read_.annotations["filename"]]},
                type="trace",
            ))

    return [x.annotations["filename"] for x in matching_reads]
def shifted(self, shift):
    """Returns a circular Dseqrecord with a new origin <shift>.

    This only works on circular Dseqrecords. If we consider the following
    circular sequence:

    | ``GAAAT   <-- watson strand``
    | ``CTTTA   <-- crick strand``

    The T and the G on the watson strand are linked together as well
    as the A and the C of the crick strand.

    if ``shift`` is 1, this indicates a new origin at position 1:

    |    new origin at the | symbol:
    |
    | ``G|AAAT``
    | ``C|TTTA``

    new sequence:

    | ``AAATG``
    | ``TTTAC``

    Features are moved along with the sequence. A feature part that
    ends up spanning the new origin is split in two, and two parts that
    meet at the position of the old origin are merged into one.

    If ``shift`` is zero or a multiple of the sequence length, the
    object itself is returned (not a copy). Otherwise a shallow copy
    with the new sequence and new features is returned.

    Raises
    ------
    TypeError
        If the sequence is linear.

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> a=Dseqrecord("aaat",circular=True)
    >>> a
    Dseqrecord(o4)
    >>> a.seq
    Dseq(o4)
    aaat
    ttta
    >>> b=a.shifted(1)
    >>> b
    Dseqrecord(o4)
    >>> b.seq
    Dseq(o4)
    aata
    ttat
    """
    if self.linear:
        raise TypeError(
            "Sequence is linear, origin can only be shifted for circular sequences.\n"
        )
    ln = len(self)
    if not shift % ln:
        return self  # shift is a multiple of ln or 0
    else:
        shift %= ln  # remainder is non-zero here, so 0 < shift < ln
    newseq = (self.seq[shift:] + self.seq[:shift]).looped()
    # From here on `shift` is the offset added to every feature position
    # (old position 0 ends up at ln - shift in the new sequence).
    shift = ln - shift
    newfeatures = []
    for feature in self.features:
        shiftedparts = [
            featurelocation + shift
            for featurelocation in feature.location.parts
        ]
        # Parts with start == end are set aside here and appended again
        # below, as shifted (without the modulo reduction).
        zero_length_parts = [
            featurelocation for featurelocation in shiftedparts
            if featurelocation.start == featurelocation.end
        ]
        newparts = []
        for location in shiftedparts:
            newstart = location.start % ln
            newend = location.end % ln
            if newstart < newend:
                # The part fits inside the new coordinates.
                newparts.append(
                    _FeatureLocation(
                        newstart,
                        newend,
                        location.strand,
                        location.ref,
                        location.ref_db,
                    ))
            elif newstart > newend:
                # The part wraps around the new origin: split it in two.
                # The order of the two pieces depends on the strand.
                if location.strand == 1:
                    newparts.extend([
                        _FeatureLocation(
                            newstart,
                            ln,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                        _FeatureLocation(
                            0,
                            newend,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                    ])
                else:
                    newparts.extend([
                        _FeatureLocation(
                            0,
                            newend,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                        _FeatureLocation(
                            newstart,
                            ln,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                    ])
            # newstart == newend: nothing is added by this loop.
        # Look for one part ending at the old origin's new position and
        # one part starting there; if both exist they are fused into one.
        p = next((p for p in newparts if p.end == shift), None)
        s = next((p for p in newparts if p.start == shift), None)
        if p and s:
            newparts.remove(p)
            newparts[newparts.index(s)] = _FeatureLocation(
                p.start, s.end, p.strand, p.ref, p.ref_db)
        # Drop falsy parts (NOTE(review): presumably zero-length
        # locations; confirm against the location class).
        newparts = [p for p in newparts if p]
        newparts.extend(zero_length_parts)
        if newparts:
            # sum() combines the parts into a single location; features
            # left without any part are dropped.
            newfeatures.append(
                _SeqFeature(
                    location=sum(newparts),
                    type=feature.type,
                    id=feature.id,
                    qualifiers=feature.qualifiers,
                ))
    newfeatures.sort(key=_operator.attrgetter("location.start"))
    answer = _copy.copy(self)
    answer.features = newfeatures
    answer.seq = newseq
    return answer
def add_feature(self, x=None, y=None, seq=None, type="misc", strand=1, *args, **kwargs):
    """Add a feature to the feature list of the sequence.

    The feature is located either by coordinates (``x``, ``y``) or by
    searching for ``seq`` in this sequence. Extra keyword arguments are
    stored as qualifiers of the new feature.

    Parameters
    ----------
    x : int, optional
        Start of the feature. A missing or falsy value means 0.
    y : int, optional
        End of the feature. A missing or falsy value means the length
        of the sequence.
    seq : str or sequence-like, optional
        If given (and truthy), ``x`` and ``y`` are ignored and the
        feature covers the first occurrence of this sequence, compared
        in lower case. Objects with a ``seq`` attribute are unwrapped,
        and the watson strand is used when that attribute has one.
    type : str, optional
        Feature type, "misc" by default.
    strand : int, optional
        Strand of the feature location, 1 by default.

    Raises
    ------
    TypeError
        If ``seq`` is given but not found in the sequence.

    Notes
    -----
    Unless a ``label`` keyword is supplied, the label is "orf<length>"
    when the feature region or its reverse complement is an open
    reading frame, and "ft<length>" otherwise.

    Examples
    --------
    >>> from pydna.seqrecord import SeqRecord
    >>> a=SeqRecord("atgtaa")
    >>> a.features
    []
    >>> a.add_feature(2,4)
    >>> a.features
    [SeqFeature(FeatureLocation(ExactPosition(2), ExactPosition(4), strand=1), type='misc')]
    """
    qualifiers = dict(kwargs)

    if seq:
        source = seq
        if hasattr(seq, "seq"):
            source = seq.seq
            if hasattr(source, "watson"):
                source = source.watson
        needle = str(source).lower()
        x = self.seq.lower().find(needle)
        if x == -1:
            raise TypeError("Could not find {}".format(needle))
        y = x + len(needle)
    else:
        x = x or 0
        y = y or len(self)

    if "label" not in qualifiers:
        span = y - x
        region = self[x:y]
        if region.isorf() or self[x:y].reverse_complement().isorf():
            qualifiers["label"] = ["orf{}".format(span)]
        else:
            qualifiers["label"] = ["ft{}".format(span)]

    self.features.append(
        _SeqFeature(
            _FeatureLocation(x, y, strand=strand),
            type=type,
            qualifiers=qualifiers,
        )
    )
def __init__(
        self,
        primers,
        template,
        limit=13,
        primerc=1000.0,  # nM
        saltc=50,  # mM
        **kwargs):
    r"""Find where the primers anneal on the template.

    The Anneal class has to be initiated with at least an iterable of
    primers and a template. The template is deep copied, and a
    "primer_bind" feature is added to the copy for every annealing site
    found.

    Parameters
    ----------
    primers : iterable of :class:`Primer` or Biopython SeqRecord like objects
        Primer sequences 5'-3'.

    template : Dseqrecord
        The template sequence 5'-3'.

    limit : int, optional
        limit length of the annealing part of the primers.

    primerc : float, optional
        Primer concentration in nM, set to 1000.0 nM by default.
        Stored as ``self.primerc``.

    saltc : float, optional
        Salt concentration in mM, set to 50 mM by default.
        Stored as ``self.saltc``.

    **kwargs
        Stored as ``self.kwargs`` in a ``defaultdict(str)``, so missing
        keys read as the empty string.

    Attributes
    ----------
    products: list
        A list of Amplicon objects, one for each primer pair that may
        form a PCR product.

    Examples
    --------
    >>> from pydna.readers import read
    >>> from pydna.amplify import Anneal
    >>> from pydna.dseqrecord import Dseqrecord
    >>> template = Dseqrecord("tacactcaccgtctatcattatctactatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = read(">p1\ntacactcaccgtctatcattatc", ds = False)
    >>> p2 = read(">p2\ngtgctatcagatgatacagtcg", ds = False)
    >>> ann = Anneal((p1, p2), template)
    >>> print(ann.report())
    Template name 51 nt linear:
    Primer p1 anneals forward at position 23
    <BLANKLINE>
    Primer p2 anneals reverse at position 29
    >>> ann.products
    [Amplicon(51)]
    >>> amplicon_list = ann.products
    >>> amplicon = amplicon_list.pop()
    >>> amplicon
    Amplicon(51)
    >>> print(amplicon.figure())
    5tacactcaccgtctatcattatc...cgactgtatcatctgatagcac3
                               |||||||||||||||||||||| tm 55.9 (dbd) 60.5
                              3gctgacatagtagactatcgtg5
    5tacactcaccgtctatcattatc3
     ||||||||||||||||||||||| tm 54.6 (dbd) 58.8
    3atgtgagtggcagatagtaatag...gctgacatagtagactatcgtg5
    >>> amplicon.annotations['date'] = '02-FEB-2013'   # Set the date for this example to pass the doctest
    >>> print(amplicon)
    Dseqrecord
    circular: False
    size: 51
    ID: 51bp U96-TO06Y6pFs74SQx8M1IVTBiY
    Name: 51bp_PCR_prod
    Description: pcr product_p1_p2
    Number of features: 2
    /date=02-FEB-2013
    Dseq(-51)
    taca..gcac
    atgt..cgtg
    >>> print(amplicon.program())
    <BLANKLINE>
    Taq (rate 30 nt/s) 35 cycles             |51bp
    95.0°C    |95.0°C                 |      |Tm formula: Biopython Tm_NN
    |_________|_____          72.0°C  |72.0°C|SaltC 50mM
    | 03min00s|30s  \         ________|______|Primer1C 1.0µM
    |         |      \ 45.4°C/ 0min 2s| 5min |Primer2C 1.0µM
    |         |       \_____/         |      |GC 39%
    |         |         30s           |      |4-12°C
    """
    self.primers = primers
    self.primerc = primerc
    self.saltc = saltc
    self.template = _copy.deepcopy(template)
    self.limit = limit
    self.kwargs = defaultdict(str, kwargs)
    self._products = None
    self.forward_primers = []
    self.reverse_primers = []

    dseq = self.template.seq
    watson_len = len(dseq.watson)
    crick_len = len(dseq.crick)
    if self.template.linear:
        watson_target = dseq.watson
        crick_target = dseq.crick
    else:
        # Non-linear template: search each strand joined to itself,
        # presumably so that sites spanning the origin are found.
        watson_target = dseq.watson + dseq.watson
        crick_target = dseq.crick + dseq.crick

    forward_shift = min(dseq.ovhg, 0)
    reverse_shift = max(0, dseq.ovhg)

    for primer in self.primers:
        # Sites on the crick strand give forward primers.
        crick_sites = _annealing_positions(str(primer.seq), crick_target, self.limit)
        self.forward_primers.extend([
            _Primer(primer,
                    position=crick_len - site - forward_shift,
                    footprint=footprint)
            for site, footprint in crick_sites
            if site < crick_len
        ])
        # Sites on the watson strand give reverse primers.
        watson_sites = _annealing_positions(str(primer.seq), watson_target, self.limit)
        self.reverse_primers.extend([
            _Primer(primer,
                    position=site + reverse_shift,
                    footprint=footprint)
            for site, footprint in watson_sites
            if site < watson_len
        ])

    self.forward_primers.sort(key=_operator.attrgetter('position'))
    self.reverse_primers.sort(key=_operator.attrgetter('position'),
                              reverse=True)

    def _ape_qualifiers(label):
        # A new dict and new lists for every feature.
        return {
            "label": [label],
            "ApEinfo_fwdcolor": ["#baffa3"],
            "ApEinfo_revcolor": ["#ffbaba"],
        }

    features = self.template.features
    template_len = len(self.template)

    for fwd in self.forward_primers:
        if fwd.position - fwd._fp < 0:
            # The footprint starts before position 0: two parts joined
            # across the origin.
            start = template_len - fwd._fp + fwd.position
            end = start + fwd._fp - template_len
            features.append(
                _SeqFeature(_CompoundLocation([
                    _FeatureLocation(start, template_len),
                    _FeatureLocation(0, end)
                ]),
                            type="primer_bind",
                            location_operator="join",
                            qualifiers=_ape_qualifiers(fwd.name)))
            continue
        features.append(
            _SeqFeature(_FeatureLocation(fwd.position - fwd._fp, fwd.position),
                        type="primer_bind",
                        strand=1,
                        qualifiers=_ape_qualifiers(fwd.name)))

    for rev in self.reverse_primers:
        if rev.position + rev._fp > template_len:
            # The footprint runs past the end of the template.
            start = rev.position
            end = rev.position + rev._fp - template_len
            features.append(
                _SeqFeature(_CompoundLocation([
                    _FeatureLocation(0, end),
                    _FeatureLocation(start, template_len)
                ]),
                            type="primer_bind",
                            location_operator="join",
                            strand=-1,
                            qualifiers={"label": [rev.name]}))
            continue
        features.append(
            _SeqFeature(_FeatureLocation(rev.position, rev.position + rev._fp),
                        type="primer_bind",
                        strand=-1,
                        qualifiers=_ape_qualifiers(rev.name)))