def fill_gaps(ppr):
    """Look for reluctant motifs in suspiciously sized gaps.

    The gaps returned by ``find_gaps(ppr, mingap=90, maxgap=120)``, plus the
    region ahead of the first motif when that motif starts more than 90
    positions into the record, are each widened by 15 positions per side,
    translated, and searched again with the F1/F2/F3 thresholds set to 0.5.
    The first feature reported for a window is mapped back onto ``ppr``
    coordinates (strand 1) and appended to ``ppr.features``.

    Returns the same ``ppr`` record; its feature list is modified in place
    and left sorted by start position.
    """
    margin = 15  # 5 codons, so widening a window keeps its reading frame

    gaps = find_gaps(ppr, mingap=3 * 30, maxgap=3 * 40)
    #check if there could be a motif at the start
    if ppr.features[0].location.start > 3 * 30:
        gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

    for g in gaps:
        offset = g.start - margin
        if offset < 0:
            # A negative lower bound would make the slice count from the end
            # of the record (yielding an empty or unrelated window) and would
            # map any hit to negative coordinates. Python's modulo of a
            # negative number is non-negative, so this picks the first
            # position >= 0 that is in the same reading frame.
            offset %= 3
        record = ppr[offset:g.end + margin]
        record.seq = record.seq.translate()
        #increase the reporting thresholds
        search = HMMER.hmmsearch(hmm=models[3], targets=record,
                                 F1=0.5, F2=0.5, F3=0.5)
        if search.matches:
            motif = search.getFeatures(record)[0]
            # feature positions are in residues of the translated window;
            # scale by 3 and shift to get back to ppr coordinates
            motif.location = FeatureLocation(
                offset + 3 * motif.location.start,
                offset + 3 * motif.location.end,
                strand=1)
            ppr.features.append(motif)

    ppr.features.sort(key=lambda p: p.location.start)
    return ppr
def test_search_results(domE=10.0, verbose=False):
    """Compare an HMM search of the first ARA_FILE record with the known PPRs.

    Every feature found is credited to the first known PPR (in list order)
    that contains the feature's midpoint.

    Returns a dict with the keys 'matches' (features found),
    'correct_matches' (features credited to a known PPR), 'hit_pprs'
    (known PPRs with at least one hit), 'total_pprs' and 'hits' (per-PPR
    hit counts). With ``verbose`` set, a summary is printed as well.
    """
    known = get_known()[0]
    for entry in known:
        entry.hit = 0

    ara = next(SeqIO.parse(ARA_FILE, 'fasta'))
    search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
    found = search.getFeatures(ara)

    for feature in found:
        midpoint = int((feature.location.start + feature.location.end) / 2)
        # only the first known PPR containing the midpoint is credited
        owner = next((entry for entry in known if midpoint in entry), None)
        if owner is not None:
            owner.hit += 1

    hits = [entry.hit for entry in known]
    ret = {
        'matches': len(found),
        'correct_matches': sum(hits),
        'hit_pprs': len([count for count in hits if count > 0]),
        'total_pprs': len(known),
        'hits': hits,
    }

    if not verbose:
        return ret

    print("Found {} HMM domains, {} within known PPRs".format(
        ret['matches'], ret['correct_matches']))
    print("Hits in {} of {} PPR proteins".format(
        ret['hit_pprs'], ret['total_pprs']))
    print("Hits per PPR: min: {} / max: {} / avg: {}".format(
        min(hits), max(hits), float(sum(hits)) / float(len(hits))))
    return ret
def test_search_results(domE=10.0, verbose=False):
    """Measure how many HMM hits on the first ARA_FILE record fall in known PPRs.

    A hit counts for the first known PPR whose span contains the hit's
    midpoint. The returned dict holds 'matches', 'correct_matches',
    'hit_pprs', 'total_pprs' and the per-PPR list 'hits'; ``verbose``
    additionally prints a three-line summary.
    """
    known_pprs = get_known()[0]
    for ppr in known_pprs:
        ppr.hit = 0

    records = SeqIO.parse(ARA_FILE, 'fasta')
    ara = next(records)
    found = HMMER.hmmsearch(hmm=extract.models[3], targets=ara,
                            domE=domE).getFeatures(ara)

    for feat in found:
        centre = int((feat.location.start + feat.location.end) / 2)
        for ppr in known_pprs:
            if centre not in ppr:
                continue
            ppr.hit += 1
            break

    hits = []
    pprs_hit = 0
    for ppr in known_pprs:
        hits.append(ppr.hit)
        if ppr.hit > 0:
            pprs_hit += 1

    ret = dict([
        ('matches', len(found)),
        ('correct_matches', sum(hits)),
        ('hit_pprs', pprs_hit),
        ('total_pprs', len(known_pprs)),
        ('hits', hits),
    ])

    if verbose:
        average = float(sum(hits)) / float(len(hits))
        print("Found {} HMM domains, {} within known PPRs".format(
            ret['matches'], ret['correct_matches']))
        print("Hits in {} of {} PPR proteins".format(ret['hit_pprs'],
                                                     ret['total_pprs']))
        print("Hits per PPR: min: {} / max: {} / avg: {}".format(
            min(hits), max(hits), average))

    return ret
def classify(pprs, family_annot="ppr_family", tail_annot='ppr_tail'):
    """Annotate each ppr with its family type (P, PLS, E, E+, DYW).

    The records returned by ``get_c_terminus(pprs)`` are searched with the
    three models from ``utils.get_tail_models()`` and annotated with the
    hits. Each ppr then receives two annotations, set in place:

    ``tail_annot``
        The tail feature types in start order, each prefixed with '-'
        (for example '-E-E+-DYW'), or '' when the tail has no features.
    ``family_annot``
        With tail features: 'DYW' if the tail string ends in 'DYW', else
        'E+' if it contains 'E+', else 'E' if it contains 'E', else '??'
        (and a message is printed). Without tail features: 'PLS' if any
        motif differs in length from the first motif, otherwise 'P'.
    """
    ct = get_c_terminus(pprs)
    (E, Ep, DYW) = utils.get_tail_models()
    h = HMMER.hmmsearch([E, Ep, DYW], ct)
    #annotate each tail
    h.annotate(ct)

    for ppr, tail in zip(pprs, ct):
        fmt = ''
        if tail.features:
            f = sorted(tail.features, key=lambda ft: int(ft.location.start))
            fmt = ("-{.type}" * len(f)).format(*f)
            if fmt[-3:] == 'DYW':
                ppr.annotations[family_annot] = 'DYW'
            elif fmt.find('E+') >= 0:
                ppr.annotations[family_annot] = 'E+'
            elif fmt.find('E') >= 0:
                ppr.annotations[family_annot] = 'E'
            else:
                print("Unknown tail format \'{}\'".format(fmt))
                ppr.annotations[family_annot] = '??'
        else:
            # Decide once over all motifs. The previous loop used `continue`
            # and then assigned 'P', so a 'PLS' verdict was overwritten by
            # later motifs of matching length.
            first_len = len(ppr.features[0])
            if any(len(motif) != first_len for motif in ppr.features):
                ppr.annotations[family_annot] = 'PLS'
            else:
                ppr.annotations[family_annot] = 'P'
        ppr.annotations[tail_annot] = fmt
def locate_ppr(envelope):
    """Find and annotate the protein within an envelope record.

    The envelope is searched for PPR motifs. Starting from the first motif
    the sequence is walked backwards in steps of three until an "atg" is
    found, and from the end of the last motif forwards in steps of three
    until a "tag", "tga" or "taa" is found.

    Returns None when fewer than two motifs are found. Otherwise returns a
    SeqRecord holding ``envelope.seq[start:stop]``, the motifs shifted to be
    relative to ``start``, and these annotations:

    * ``src_from`` / ``src_to`` / ``src_strand`` -- the extracted span
      expressed through the envelope's own ``src_*`` annotations.
    * ``no_start`` -- present (True) when no "atg" was found.
    * ``no_stop`` -- present (True) when no stop codon was found.
    """
    stop_codons = ["tag", "tga", "taa"]

    #find all the PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
    motifs = search.getFeatures(envelope)
    #A ppr must contain 2 or more PPR motifs
    if len(motifs) < 2:
        return None
    #order the motifs
    motifs.sort(key=lambda m: m.location.start)

    #find start codon
    start = motifs[0].location.start
    while start > 0 and str(envelope.seq[start:start + 3]).lower() != "atg":
        start -= 3
    # The scan also stops on reaching position 0 without having compared the
    # codon there, so the start only counts as known if an "atg" really sits
    # at the final in-range position.
    known_start = (start >= 0 and
                   str(envelope.seq[start:start + 3]).lower() == "atg")
    if start < 0:
        start = 0

    #find stop codon
    stop = motifs[-1].location.end
    while stop < len(envelope) and (
            str(envelope.seq[stop:stop + 3]).lower() not in stop_codons):
        stop += 3
    # Landing exactly on the end of the envelope means the sequence ran out
    # before a stop codon was seen, the same as overshooting it.
    known_stop = stop < len(envelope)
    if not known_stop:
        stop = len(envelope)

    #move the motifs
    for m in motifs:
        m.location = FeatureLocation(m.location.start - start,
                                     m.location.end - start)

    #get absolute start and end
    if envelope.annotations['src_strand'] > 0:
        src_from = envelope.annotations['src_from'] + start
        src_to = envelope.annotations['src_from'] + stop
    else:
        src_from = envelope.annotations['src_to'] - stop
        src_to = envelope.annotations['src_to'] - start

    annotations = {
        "src_from": src_from,
        "src_to": src_to,
        "src_strand": envelope.annotations['src_strand'],
    }
    if not known_stop:
        annotations['no_stop'] = True
    if not known_start:
        annotations['no_start'] = True

    #return a record
    return SeqRecord(envelope.seq[start:stop], features=motifs,
                     annotations=annotations)
def locate_ppr(envelope):
    """Find and annotate the protein within an envelope record.

    Searches the envelope for PPR motifs, extends backwards from the first
    motif (three positions at a time) to an "atg" and forwards from the
    last motif to a "tag", "tga" or "taa", and cuts the span out.

    Returns None if fewer than two motifs are found; otherwise a SeqRecord
    of ``envelope.seq[start:stop]`` whose features are the motifs
    re-expressed relative to ``start``. Its annotations are ``src_from``,
    ``src_to`` and ``src_strand`` (derived from the envelope's own ``src_*``
    annotations), plus ``no_start`` and/or ``no_stop`` set to True when the
    corresponding codon was not found.
    """
    #find all the PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
    motifs = search.getFeatures(envelope)
    #A ppr must contain 2 or more PPR motifs
    if len(motifs) < 2:
        return None
    #order the motifs
    motifs.sort(key=lambda m: m.location.start)

    envelope_len = len(envelope)

    def codon_at(pos):
        # lower-cased (possibly truncated) triplet starting at pos
        return str(envelope.seq[pos:pos + 3]).lower()

    #find start codon
    start = motifs[0].location.start
    while start > 0 and codon_at(start) != "atg":
        start -= 3
    if start < 0:
        # walked off the front of the envelope
        known_start = False
        start = 0
    else:
        # The loop exits at 0 without testing the codon there, so check it
        # explicitly instead of assuming a start codon was found.
        known_start = codon_at(start) == "atg"

    #find stop codon
    stop = motifs[-1].location.end
    while stop < envelope_len and codon_at(stop) not in ("tag", "tga", "taa"):
        stop += 3
    # `>=` rather than `>`: stopping exactly at the envelope's end means no
    # stop codon was matched either.
    known_stop = True
    if stop >= envelope_len:
        known_stop = False
        stop = envelope_len

    #move the motifs
    for m in motifs:
        m.location = FeatureLocation(m.location.start - start,
                                     m.location.end - start)

    #get absolute start and end
    src_strand = envelope.annotations['src_strand']
    if src_strand > 0:
        src_from = envelope.annotations['src_from'] + start
        src_to = envelope.annotations['src_from'] + stop
    else:
        src_from = envelope.annotations['src_to'] - stop
        src_to = envelope.annotations['src_to'] - start

    annotations = {
        "src_from": src_from,
        "src_to": src_to,
        "src_strand": src_strand,
    }
    if not known_stop:
        annotations['no_stop'] = True
    if not known_start:
        annotations['no_start'] = True

    #return a record
    return SeqRecord(envelope.seq[start:stop],
                     features=motifs,
                     annotations=annotations)
def test_annotation(self):
    """Annotating a record and its reverse complement yields matching features.

    Both copies of PPR10 are searched with the same hmm; the features are
    paired in list order and must extract to identical sequence text.
    """
    #load the hmm
    hmm = hmmfile.read('tests/data/valid.hmm')

    forward = SeqIO.read('tests/data/PPR10.gb', 'genbank')
    forward.features = []

    reverse = SeqIO.read('tests/data/PPR10.gb', 'genbank')
    reverse.features = []
    reverse.seq = reverse.seq.reverse_complement()

    forward_search = HMMER.hmmsearch(hmm, forward)
    reverse_search = HMMER.hmmsearch(hmm, reverse)
    forward_search.annotate(forward)
    reverse_search.annotate(reverse)

    for fwd_feature, rev_feature in zip(forward.features, reverse.features):
        fwd_text = str(fwd_feature.extract(forward.seq))
        rev_text = str(rev_feature.extract(reverse.seq))
        self.assertEqual(fwd_text, rev_text)
def test_translation_search(self):
    """Searching the DNA fixture gives 17 valid matches, all in frame 1."""
    expected_frame = 1
    targets = SeqIO.parse('tests/data/dna_target.fasta', 'fasta',
                          alphabet=Alphabet.generic_dna)
    result = HMMER.hmmsearch('tests/data/valid.hmm', targets)

    self.assertEqual(len(result.matches), 17)
    check_valid(self, result.matches)

    #check that matches all have frame 1
    for match in result.matches:
        frame = match.getFrame()
        self.assertEqual(frame, expected_frame)

    #test feature extraction
    for match in result.matches:
        feature = match.asSeqFeature()
        self.assertEqual(feature.qualifiers['frame'], expected_frame)
def simple_extract(target, localization=None, verbose=False):
    """Extract all the PPRs found in target.

    Motifs found in ``target`` are grouped, an envelope is cut around each
    group and a PPR is located inside each envelope. Groups returned by
    ``remove_overlaps`` and ``remove_overgrown`` are processed again until
    none remain. The surviving PPRs then go through ``add_source``,
    ``fill_gaps``, ``clean_ends``, ``classify.classify`` and
    ``targetp.targetp``.

    Args:
        target: the Bio.SeqRecord to search.
        localization: if truthy, keep only PPRs whose 'localization'
            annotation equals this value.
        verbose: print progress messages while working.

    Returns:
        A list of PPR records.

    Raises:
        TypeError: if ``target`` is not a SeqRecord.
    """
    if not isinstance(target, SeqRecord):
        raise TypeError(
            "simple_extract requires a Bio.SeqRecord, not {}".format(
                type(target)))

    if verbose:
        print("Searching...")
    #find all easy-to-locate PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=target)
    #get features for each motif
    motifs = search.getFeatures(target)

    if verbose:
        print("Got {} motifs, grouping...".format(len(motifs)))
    #group features by frame and location
    groups = group_motifs(motifs, max_gap=1500)

    if verbose:
        print("Got {} groups, extracting envelopes...".format(len(groups)))

    pprs = []
    # NOTE: a debug list that collected every envelope was dropped here; it
    # was never read and kept all envelope records alive until return.
    while groups:
        if verbose:
            print("Got {} groups, extracting envelopes...".format(len(groups)))
        #extract the sequence envelope around each group
        envelopes = [
            get_envelope(group, target, margin=1000) for group in groups
        ]

        if verbose:
            print("Got {} envelopes, locating PPRs...".format(len(envelopes)))
        #locate the PPR within each envelope
        for envelope in envelopes:
            ppr = locate_ppr(envelope)
            if ppr:
                pprs.append(ppr)

        #look for overlapping pprs
        groups = remove_overlaps(pprs)
        ol = len(groups)
        if verbose:
            print("{} conflicts".format(ol))
        groups += remove_overgrown(pprs, 500)
        if verbose:
            print("{} overgrown PPRs".format(len(groups) - ol))

    pprs = [add_source(p, target) for p in pprs]

    if verbose:
        print("Got {} PPRs, cleaning...".format(len(pprs)))
    #clean the gaps between features
    pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

    #annotate the tail region and classify each PPR
    classify.classify(pprs)

    #predict each PPR's target
    targetp.targetp(pprs, annotation='localization')

    #filter the desired location
    if localization:
        pprs = [
            p for p in pprs if p.annotations['localization'] == localization
        ]

    #return a list of nicely presented PPRs
    return pprs
def test_search(self):
    """Searching the protein fixture with valid.hmm gives 17 valid matches."""
    targets = SeqIO.parse('tests/data/matchtarget.fasta', 'fasta',
                          alphabet=Alphabet.generic_protein)
    result = HMMER.hmmsearch('tests/data/valid.hmm', targets)

    match_count = len(result.matches)
    self.assertEqual(match_count, 17)
    check_valid(self, result.matches)
def setUp(self):
    """Create an hmmsearch object wired to this fixture's hmm and target."""
    #build a fake search
    self.hs = fake_search = HMMER.hmmsearch()
    fake_search.hmms = [self.hmm]
    fake_search.targets = [self.target]
def simple_extract(target, localization=None, verbose=False):
    """Extract all the PPRs found in target.

    Pipeline: search ``target`` for motifs, group them, cut an envelope
    around each group and locate a PPR in each envelope. Any groups handed
    back by ``remove_overlaps`` / ``remove_overgrown`` are run through the
    envelope step again until no groups are left. The resulting PPRs get
    their source added, gaps filled, ends cleaned, family classified and
    localization predicted.

    Args:
        target: the Bio.SeqRecord to search.
        localization: when truthy, only PPRs whose 'localization'
            annotation equals it are returned.
        verbose: print progress messages.

    Returns:
        A list of PPR records.

    Raises:
        TypeError: if ``target`` is not a SeqRecord.
    """
    if not isinstance(target, SeqRecord):
        raise TypeError("simple_extract requires a Bio.SeqRecord, not {}".format(
            type(target)))

    def report(message, *values):
        # progress output, emitted only in verbose mode
        if verbose:
            print(message.format(*values))

    report("Searching...")
    #find all easy-to-locate PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=target)
    #get features for each motif
    motifs = search.getFeatures(target)

    report("Got {} motifs, grouping...", len(motifs))
    #group features by frame and location
    groups = group_motifs(motifs, max_gap=1500)
    report("Got {} groups, extracting envelopes...", len(groups))

    pprs = []
    # The former debug-only list of envelopes is gone: nothing read it, and
    # it held on to every envelope record for the whole call.
    while groups:
        report("Got {} groups, extracting envelopes...", len(groups))
        #extract the sequence envelope around each group
        envelopes = [get_envelope(group, target, margin=1000)
                     for group in groups]

        report("Got {} envelopes, locating PPRs...", len(envelopes))
        #locate the PPR within each envelope
        for envelope in envelopes:
            ppr = locate_ppr(envelope)
            if ppr:
                pprs.append(ppr)

        #look for overlapping pprs
        groups = remove_overlaps(pprs)
        ol = len(groups)
        report("{} conflicts", ol)
        groups += remove_overgrown(pprs, 500)
        report("{} overgrown PPRs", len(groups) - ol)

    pprs = [add_source(p, target) for p in pprs]

    report("Got {} PPRs, cleaning...", len(pprs))
    #clean the gaps between features
    pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

    #annotate the tail region and classify each PPR
    classify.classify(pprs)

    #predict each PPR's target
    targetp.targetp(pprs, annotation='localization')

    #filter the desired location
    if localization:
        pprs = [p for p in pprs
                if p.annotations['localization'] == localization]

    #return a list of nicely presented PPRs
    return pprs
def fill_gaps(ppr):
    """Look for reluctant motifs in suspiciously sized gaps.

    Candidate regions are the gaps from
    ``find_gaps(ppr, mingap=90, maxgap=120)`` and, if the first motif starts
    more than 90 positions in, the stretch from 0 to that motif. Each region
    is padded by 15 positions on both sides, translated and searched with
    F1, F2 and F3 set to 0.5. When the search has matches, its first feature
    is converted back to ``ppr`` coordinates (strand 1) and appended to
    ``ppr.features``.

    Returns ``ppr`` itself, with its features sorted by start position.
    """
    pad = 15  # a multiple of 3, so padding does not shift the reading frame

    gaps = find_gaps(ppr, mingap=3 * 30, maxgap=3 * 40)
    #check if there could be a motif at the start
    first_start = ppr.features[0].location.start
    if first_start > 3 * 30:
        gaps.append(FeatureLocation(0, first_start, 1))

    for g in gaps:
        window_start = g.start - pad
        # The leading gap begins at 0, which would make the bound negative.
        # A negative slice bound is taken relative to the end of the record,
        # so step forward a codon at a time until the window starts inside
        # the record while staying in frame.
        while window_start < 0:
            window_start += 3
        window_end = g.end + pad

        record = ppr[window_start:window_end]
        record.seq = record.seq.translate()
        #increase the reporting thresholds
        search = HMMER.hmmsearch(hmm=models[3], targets=record,
                                 F1=0.5, F2=0.5, F3=0.5)
        if not search.matches:
            continue

        motif = search.getFeatures(record)[0]
        # positions on the translated window are in residues: multiply by 3
        # and add the window start to return to ppr coordinates
        motif.location = FeatureLocation(
            window_start + 3 * motif.location.start,
            window_start + 3 * motif.location.end,
            strand=1)
        ppr.features.append(motif)

    ppr.features.sort(key=lambda p: p.location.start)
    return ppr