Exemplo n.º 1
0
def fill_gaps(ppr):
    """Look for reluctant motifs in suspiciously sized gaps.

    Candidate gaps are requested from ``find_gaps`` (mingap=3 * 30,
    maxgap=3 * 40); the region before the first feature is added as well when
    that feature starts past position 3 * 30.  Each gap, widened by 15
    positions on both sides, is translated and searched again with models[3]
    and F1/F2/F3 set to 0.5.  The first feature reported for a window is
    mapped back to ``ppr`` coordinates (3 positions per residue) and appended
    to ``ppr.features``.

    Returns the same ``ppr`` object, with its features sorted by start.
    """
    margin = 15  # a multiple of 3, so widening the window keeps the frame

    gaps = find_gaps(ppr, mingap=3 * 30, maxgap=3 * 40)
    #check if there could be a motif at the start
    if ppr.features[0].location.start > 3 * 30:
        gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

    for g in gaps:
        # A negative slice start would wrap around to the end of the record
        # (this always happened for the leading gap, which starts at 0), so
        # clamp it to the first non-negative position in the same frame.
        offset = g.start - margin
        if offset < 0:
            offset = g.start % 3

        record = ppr[offset:g.end + margin]
        record.seq = record.seq.translate()

        #increase the reporting thresholds
        search = HMMER.hmmsearch(hmm=models[3],
                                 targets=record,
                                 F1=0.5,
                                 F2=0.5,
                                 F3=0.5)

        if search.matches:
            motif = search.getFeatures(record)[0]
            # residue coordinates in the window -> positions in ppr
            motif.location = FeatureLocation(offset + 3 * motif.location.start,
                                             offset + 3 * motif.location.end,
                                             strand=1)
            ppr.features.append(motif)
    ppr.features.sort(key=lambda p: p.location.start)
    return ppr
Exemplo n.º 2
0
def test_search_results(domE=10.0, verbose=False):
	"""Count how many HMM domain hits fall inside the known PPRs.

	The first record of ARA_FILE is searched with extract.models[3], passing
	``domE`` through to hmmsearch.  Each reported feature is credited to the
	first known entry that contains its midpoint; the per-entry count is
	stored on the entry as ``hit``.

	Returns a dict with the keys 'matches', 'correct_matches', 'hit_pprs',
	'total_pprs' and 'hits'.  With ``verbose`` a summary is printed too.
	"""
	known = get_known()[0]
	for k in known:
		k.hit = 0

	# the next() builtin works on Python 2.6+ and 3, unlike the .next() method
	ara = next(SeqIO.parse(ARA_FILE, 'fasta'))

	search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
	found = search.getFeatures(ara)

	for f in found:
		# floor division keeps the midpoint integral on Python 3 as well
		p = int((f.location.start + f.location.end) // 2)
		for k in known:
			if p in k:
				k.hit += 1
				break

	hits = [j.hit for j in known]
	ret = {
		'matches': len(found),
		'correct_matches': sum(hits),
		'hit_pprs': sum(1 for i in hits if i > 0),
		'total_pprs': len(known),
		'hits': hits,
	}
	if verbose:
		print("Found {} HMM domains, {} within known PPRs".format(
			ret['matches'], ret['correct_matches']))
		print("Hits in {} of {} PPR proteins".format(
			ret['hit_pprs'], ret['total_pprs']))
		# min()/max() and the average are undefined without known entries
		if hits:
			print("Hits per PPR: min: {} / max: {} / avg: {}".format(
				min(hits), max(hits), float(sum(hits)) / float(len(hits))))

	return ret
Exemplo n.º 3
0
def test_search_results(domE=10.0, verbose=False):
    """Count how many HMM domain hits fall inside the known PPRs.

    The first record of ARA_FILE is searched with extract.models[3], passing
    ``domE`` through to hmmsearch.  Each reported feature is credited to the
    first known entry that contains its midpoint; the per-entry count is
    stored on the entry as ``hit``.

    Returns a dict with the keys 'matches', 'correct_matches', 'hit_pprs',
    'total_pprs' and 'hits'.  With ``verbose`` a summary is printed too.
    """
    known = get_known()[0]
    for k in known:
        k.hit = 0

    # the next() builtin works on Python 2.6+ and 3, unlike the .next() method
    ara = next(SeqIO.parse(ARA_FILE, 'fasta'))

    search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
    found = search.getFeatures(ara)

    for f in found:
        # floor division keeps the midpoint integral on Python 3 as well
        p = int((f.location.start + f.location.end) // 2)
        for k in known:
            if p in k:
                k.hit += 1
                break

    hits = [j.hit for j in known]
    ret = {
        'matches': len(found),
        'correct_matches': sum(hits),
        'hit_pprs': sum(1 for i in hits if i > 0),
        'total_pprs': len(known),
        'hits': hits,
    }
    if verbose:
        print("Found {} HMM domains, {} within known PPRs".format(
            ret['matches'], ret['correct_matches']))
        print("Hits in {} of {} PPR proteins".format(ret['hit_pprs'],
                                                     ret['total_pprs']))
        # min()/max() and the average are undefined without known entries
        if hits:
            print("Hits per PPR: min: {} / max: {} / avg: {}".format(
                min(hits), max(hits),
                float(sum(hits)) / float(len(hits))))

    return ret
Exemplo n.º 4
0
def classify(pprs, family_annot="ppr_family", tail_annot='ppr_tail'):
	"""Annotate each ppr with its family type (P, PLS, E, E+, DYW).

	The C-terminal records returned by get_c_terminus(pprs) are searched with
	the three tail models and annotated with the hits.  For every ppr:

	* ``annotations[tail_annot]`` receives the tail layout, i.e. the types of
	  the tail features ordered by start, each prefixed with '-' (an empty
	  string when the tail has no features);
	* ``annotations[family_annot]`` receives 'DYW', 'E+' or 'E' depending on
	  that layout ('??' when none applies), or, for a ppr without tail
	  features, 'P' when all its motifs have the same length and 'PLS'
	  otherwise.

	The records are modified in place; nothing is returned.
	"""
	ct = get_c_terminus(pprs)
	(E, Ep, DYW) = utils.get_tail_models()

	h = HMMER.hmmsearch([E, Ep, DYW], ct)

	#annotate each tail
	h.annotate(ct)

	for ppr, tail in zip(pprs, ct):
		fmt = ''
		if tail.features:
			f = sorted(tail.features, key=lambda ft: int(ft.location.start))
			fmt = ("-{.type}" * len(f)).format(*f)
			if fmt.endswith('DYW'):
				ppr.annotations[family_annot] = 'DYW'
			elif 'E+' in fmt:
				ppr.annotations[family_annot] = 'E+'
			elif 'E' in fmt:
				ppr.annotations[family_annot] = 'E'
			else:
				print("Unknown tail format \'{}\'".format(fmt))
				ppr.annotations[family_annot] = '??'

		else:
			# Previously 'PLS' was always overwritten by 'P' after the loop;
			# decide once from the set of motif lengths instead.  A ppr
			# without any features is reported as 'P'.
			lengths = set(len(f) for f in ppr.features)
			if len(lengths) > 1:
				ppr.annotations[family_annot] = 'PLS'
			else:
				ppr.annotations[family_annot] = 'P'

		ppr.annotations[tail_annot] = fmt
Exemplo n.º 5
0
def locate_ppr(envelope):
    """Find and annotate the protein within ``envelope``.

    The envelope is searched with models[3].  Starting at the first motif the
    sequence is scanned backwards, three positions at a time, for an ATG, and
    from the end of the last motif forwards for a stop codon (TAG, TGA, TAA).

    Returns None when fewer than two motifs are found.  Otherwise returns a
    SeqRecord holding ``envelope.seq[start:stop]`` with the motifs relocated
    relative to ``start`` and the annotations 'src_from', 'src_to' and
    'src_strand'.  'no_start' / 'no_stop' are set to True when the respective
    codon was not found before running off the envelope; the record then
    extends to that end of the envelope.
    """
    #find all the PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
    motifs = search.getFeatures(envelope)

    #A ppr must contain 2 or more PPR motifs
    if len(motifs) < 2:
        return None

    #order the motifs
    motifs.sort(key=lambda m: m.location.start)
    seq_len = len(envelope)
    stop_codons = ("tag", "tga", "taa")

    #find start codon
    # Position 0 is tested too, so reaching it without an ATG is reported as
    # an unknown start instead of being silently accepted.
    start = motifs[0].location.start
    while start >= 0 and str(envelope.seq[start:start + 3]).lower() != "atg":
        start -= 3
    known_start = start >= 0
    if not known_start:
        start = 0

    #find stop codon
    # Only complete codons are tested; running out of sequence, including
    # landing exactly on the end, means no stop codon was found.
    stop = motifs[-1].location.end
    while stop + 3 <= seq_len and (str(envelope.seq[stop:stop + 3]).lower()
                                   not in stop_codons):
        stop += 3
    known_stop = stop + 3 <= seq_len
    if not known_stop:
        stop = seq_len

    #move the motifs
    for m in motifs:
        m.location = FeatureLocation(m.location.start - start,
                                     m.location.end - start)

    #get absolute start and end
    if envelope.annotations['src_strand'] > 0:
        src_from = envelope.annotations['src_from'] + start
        src_to = envelope.annotations['src_from'] + stop
    else:
        src_from = envelope.annotations['src_to'] - stop
        src_to = envelope.annotations['src_to'] - start

    annotations = {
        "src_from": src_from,
        "src_to": src_to,
        "src_strand": envelope.annotations['src_strand'],
    }
    if not known_stop:
        annotations['no_stop'] = True
    if not known_start:
        annotations['no_start'] = True

    #return a record
    return SeqRecord(envelope.seq[start:stop],
                     features=motifs,
                     annotations=annotations)
Exemplo n.º 6
0
def locate_ppr(envelope):
	"""Find and annotate the protein within ``envelope``.

	The envelope is searched with models[3].  Starting at the first motif the
	sequence is scanned backwards, three positions at a time, for an ATG, and
	from the end of the last motif forwards for a stop codon (TAG, TGA, TAA).

	Returns None when fewer than two motifs are found.  Otherwise returns a
	SeqRecord holding ``envelope.seq[start:stop]`` with the motifs relocated
	relative to ``start`` and the annotations 'src_from', 'src_to' and
	'src_strand'.  'no_start' / 'no_stop' are set to True when the respective
	codon was not found before running off the envelope; the record then
	extends to that end of the envelope.
	"""
	#find all the PPR motifs
	search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
	motifs = search.getFeatures(envelope)

	#A ppr must contain 2 or more PPR motifs
	if len(motifs) < 2:
		return None

	#order the motifs
	motifs.sort(key=lambda m: m.location.start)
	seq_len = len(envelope)
	stop_codons = ("tag", "tga", "taa")

	#find start codon
	# Position 0 is tested too, so reaching it without an ATG is reported as
	# an unknown start instead of being silently accepted.
	start = motifs[0].location.start
	while start >= 0 and str(envelope.seq[start:start + 3]).lower() != "atg":
		start -= 3
	known_start = start >= 0
	if not known_start:
		start = 0

	#find stop codon
	# Only complete codons are tested; running out of sequence, including
	# landing exactly on the end, means no stop codon was found.
	stop = motifs[-1].location.end
	while stop + 3 <= seq_len and (
			str(envelope.seq[stop:stop + 3]).lower() not in stop_codons):
		stop += 3
	known_stop = stop + 3 <= seq_len
	if not known_stop:
		stop = seq_len

	#move the motifs
	for m in motifs:
		m.location = FeatureLocation(m.location.start - start,
				m.location.end - start)

	#get absolute start and end
	if envelope.annotations['src_strand'] > 0:
		src_from = envelope.annotations['src_from'] + start
		src_to = envelope.annotations['src_from'] + stop
	else:
		src_from = envelope.annotations['src_to'] - stop
		src_to = envelope.annotations['src_to'] - start

	annotations = {
		"src_from": src_from,
		"src_to": src_to,
		"src_strand": envelope.annotations['src_strand'],
	}
	if not known_stop:
		annotations['no_stop'] = True
	if not known_start:
		annotations['no_start'] = True

	#return a record
	return SeqRecord(envelope.seq[start:stop],
			features=motifs,
			annotations=annotations)
Exemplo n.º 7
0
	def test_annotation(self):
		# Features annotated on a record and on its reverse complement must
		# extract to the same sequence, pair by pair.
		model = hmmfile.read('tests/data/valid.hmm')

		# two independent copies of the record, stripped of their features
		forward = SeqIO.read('tests/data/PPR10.gb', 'genbank')
		forward.features = []
		flipped = SeqIO.read('tests/data/PPR10.gb', 'genbank')
		flipped.features = []
		flipped.seq = flipped.seq.reverse_complement()

		forward_search = HMMER.hmmsearch(model, forward)
		flipped_search = HMMER.hmmsearch(model, flipped)
		forward_search.annotate(forward)
		flipped_search.annotate(flipped)

		for fwd_feature, flip_feature in zip(forward.features, flipped.features):
			expected = str(fwd_feature.extract(forward.seq))
			actual = str(flip_feature.extract(flipped.seq))
			self.assertEqual(expected, actual)
Exemplo n.º 8
0
	def test_translation_search(self):
		# Searching DNA targets must report the same 17 matches, all of them
		# in frame 1.
		targets = SeqIO.parse(
			'tests/data/dna_target.fasta',
			'fasta',
			alphabet=Alphabet.generic_dna)
		result = HMMER.hmmsearch('tests/data/valid.hmm', targets)
		self.assertEqual(len(result.matches), 17)
		check_valid(self, result.matches)

		# every match reports frame 1 ...
		for match in result.matches:
			self.assertEqual(match.getFrame(), 1)

		# ... and carries it over into the extracted feature's qualifiers
		for match in result.matches:
			feature = match.asSeqFeature()
			self.assertEqual(feature.qualifiers['frame'], 1)
Exemplo n.º 9
0
def simple_extract(target, localization=None, verbose=False):
    """Extract all the PPRs found in target.

    target        the Bio.SeqRecord to search
    localization  when set, only PPRs whose 'localization' annotation equals
                  this value are returned
    verbose       print progress messages

    Motifs found with models[3] are grouped, an envelope is cut around each
    group and locate_ppr is run on it.  Groups handed back by remove_overlaps
    and remove_overgrown are processed again until none are left.  The
    located PPRs then go through add_source, fill_gaps and clean_ends, are
    classified, and get a 'localization' annotation from targetp.

    Returns the resulting list of PPRs.
    Raises TypeError if target is not a SeqRecord.
    """
    if not isinstance(target, SeqRecord):
        raise TypeError(
            "simple_extract requires a Bio.SeqRecord, not {}".format(
                type(target)))

    if verbose:
        print("Searching...")
    #find all easy-to-locate PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=target)

    #get features for each motif
    motifs = search.getFeatures(target)

    if verbose:
        print("Got {} motifs, grouping...".format(len(motifs)))
    #group features by frame and location
    groups = group_motifs(motifs, max_gap=1500)
    if verbose:
        print("Got {} groups, extracting envelopes...".format(len(groups)))

    pprs = []

    while groups:
        if verbose:
            print("Got {} groups, extracting envelopes...".format(len(groups)))
        #extract the sequence envelope around each group
        envelopes = [
            get_envelope(group, target, margin=1000) for group in groups
        ]

        if verbose:
            print("Got {} envelopes, locating PPRs...".format(len(envelopes)))
        #locate the PPR within each envelope
        for envelope in envelopes:
            ppr = locate_ppr(envelope)
            if ppr:
                pprs.append(ppr)

        #look for overlapping pprs
        # NOTE(review): presumably both helpers drop the offending entries
        # from pprs and return the groups to retry - confirm in their source.
        groups = remove_overlaps(pprs)
        ol = len(groups)
        if verbose:
            print("{} conflicts".format(ol))
        groups += remove_overgrown(pprs, 500)
        if verbose:
            print("{} overgrown PPRs".format(len(groups) - ol))

    pprs = [add_source(p, target) for p in pprs]

    if verbose:
        print("Got {} PPRs, cleaning...".format(len(pprs)))
    #clean the gaps between features
    pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

    #annotate the tail region and classify each PPR
    classify.classify(pprs)

    #predict each PPR's target
    targetp.targetp(pprs, annotation='localization')

    #filter the desired location
    if localization:
        pprs = [
            p for p in pprs if p.annotations['localization'] == localization
        ]

    #return a list of nicely presented PPRs
    return pprs
Exemplo n.º 10
0
	def test_search(self):
		# A protein target file searched with the valid model yields 17
		# matches, all of which must pass check_valid.
		targets = SeqIO.parse(
			'tests/data/matchtarget.fasta',
			'fasta',
			alphabet=Alphabet.generic_protein)
		result = HMMER.hmmsearch('tests/data/valid.hmm', targets)
		self.assertEqual(len(result.matches), 17)
		check_valid(self, result.matches)
Exemplo n.º 11
0
	def setUp(self):
		# Assemble a search object by hand instead of running one: an empty
		# hmmsearch that is given a single model and a single target.
		fake_search = HMMER.hmmsearch()
		self.hs = fake_search
		fake_search.hmms = [self.hmm]
		fake_search.targets = [self.target]
Exemplo n.º 12
0
def simple_extract(target, localization = None, verbose=False):
	"""Extract all the PPRs found in target.

	target        the Bio.SeqRecord to search
	localization  when set, only PPRs whose 'localization' annotation equals
	              this value are returned
	verbose       print progress messages

	Motifs found with models[3] are grouped, an envelope is cut around each
	group and locate_ppr is run on it.  Groups handed back by remove_overlaps
	and remove_overgrown are processed again until none are left.  The
	located PPRs then go through add_source, fill_gaps and clean_ends, are
	classified, and get a 'localization' annotation from targetp.

	Returns the resulting list of PPRs.
	Raises TypeError if target is not a SeqRecord.
	"""
	if not isinstance(target, SeqRecord):
		raise TypeError("simple_extract requires a Bio.SeqRecord, not {}".format(
			type(target)))

	if verbose:
		print("Searching...")
	#find all easy-to-locate PPR motifs
	search = HMMER.hmmsearch(hmm=models[3], targets=target)

	#get features for each motif
	motifs = search.getFeatures(target)

	if verbose:
		print("Got {} motifs, grouping...".format(len(motifs)))
	#group features by frame and location
	groups = group_motifs(motifs, max_gap=1500)
	if verbose:
		print("Got {} groups, extracting envelopes...".format(len(groups)))

	pprs = []

	while groups:
		if verbose:
			print("Got {} groups, extracting envelopes...".format(len(groups)))
		#extract the sequence envelope around each group
		envelopes = [get_envelope(group, target, margin=1000) for group in groups]

		if verbose:
			print("Got {} envelopes, locating PPRs...".format(len(envelopes)))
		#locate the PPR within each envelope
		for envelope in envelopes:
			ppr = locate_ppr(envelope)
			if ppr:
				pprs.append(ppr)

		#look for overlapping pprs
		# NOTE(review): presumably both helpers drop the offending entries
		# from pprs and return the groups to retry - confirm in their source.
		groups = remove_overlaps(pprs)
		ol = len(groups)
		if verbose:
			print("{} conflicts".format(ol))
		groups += remove_overgrown(pprs, 500)
		if verbose:
			print("{} overgrown PPRs".format(len(groups) - ol))

	pprs = [add_source(p, target) for p in pprs]

	if verbose:
		print("Got {} PPRs, cleaning...".format(len(pprs)))
	#clean the gaps between features
	pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

	#annotate the tail region and classify each PPR
	classify.classify(pprs)

	#predict each PPR's target
	targetp.targetp(pprs, annotation='localization')

	#filter the desired location
	if localization:
		pprs = [p for p in pprs if p.annotations['localization'] == localization]

	#return a list of nicely presented PPRs
	return pprs
Exemplo n.º 13
-2
def fill_gaps(ppr):
	"""Look for reluctant motifs in suspiciously sized gaps.

	Candidate gaps are requested from ``find_gaps`` (mingap=3*30,
	maxgap=3*40); the region before the first feature is added as well when
	that feature starts past position 3*30.  Each gap, widened by 15
	positions on both sides, is translated and searched again with models[3]
	and F1/F2/F3 set to 0.5.  The first feature reported for a window is
	mapped back to ``ppr`` coordinates (3 positions per residue) and appended
	to ``ppr.features``.

	Returns the same ``ppr`` object, with its features sorted by start.
	"""
	margin = 15  # a multiple of 3, so widening the window keeps the frame

	gaps = find_gaps(ppr, mingap=3*30, maxgap=3*40)
	#check if there could be a motif at the start
	if ppr.features[0].location.start > 3*30:
		gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

	for g in gaps:
		# A negative slice start would wrap around to the end of the record
		# (this always happened for the leading gap, which starts at 0), so
		# clamp it to the first non-negative position in the same frame.
		offset = g.start - margin
		if offset < 0:
			offset = g.start % 3

		record = ppr[offset:g.end + margin]
		record.seq = record.seq.translate()

		#increase the reporting thresholds
		search = HMMER.hmmsearch(hmm=models[3], targets=record,
				F1=0.5, F2=0.5, F3=0.5)

		if search.matches:
			motif = search.getFeatures(record)[0]
			# residue coordinates in the window -> positions in ppr
			motif.location = FeatureLocation(
					offset + 3*motif.location.start,
					offset + 3*motif.location.end,
					strand=1)
			ppr.features.append(motif)
	ppr.features.sort(key=lambda p: p.location.start)
	return ppr