예제 #1
	def test_jackhmmer(self):
		# Runs HMMER.jackhmmer on a query/target pair read from FASTA fixtures
		# and compares its matches with the stored expected output.
		# NOTE(review): both SeqIO.read calls below stop after the format
		# argument with the parenthesis still open - the end of each call is
		# missing from this copy and has to be restored.
		seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta', 
		seqdb = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',

		j = HMMER.jackhmmer(seq, seqdb)

		# Load the expected output; both records are wrapped with
		# HMMER.wrap_seqrecords before being handed to matchfile.load.
		seq_ = HMMER.wrap_seqrecords([seq,])
		seqdb_ = HMMER.wrap_seqrecords([seqdb,])
		m = matchfile.load('tests/data/jack_out', seq_, seqdb_)

		self.assertEqual(m, j.matches)
예제 #2
def fill_gaps(ppr):
    """Look for reluctant motifs in suspiciously sized gaps"""

    # Candidate gaps are those find_gaps reports between 3*30 and 3*40 long
    # (presumably 30-40 codons expressed in nucleotides - TODO confirm).
    gaps = find_gaps(ppr, mingap=3 * 30, maxgap=3 * 40)
    # Check if there could be a motif at the start: the stretch before the
    # first feature is added as an extra gap when it is longer than 3*30.
    if ppr.features[0].location.start > 3 * 30:
        gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

    for g in gaps:
        # Search a translated copy of the gap plus a 15 position margin on
        # each side.
        # NOTE(review): for the leading gap g.start is 0, so g.start - 15 is
        # negative - confirm how ppr[...] slicing treats that.
        record = ppr[g.start - 15:g.end + 15]
        record.seq = record.seq.translate()

        # Increase the reporting thresholds.
        # NOTE(review): the hmmsearch call below is cut off after its first
        # argument; the remaining arguments and the closing parenthesis are
        # missing from this copy.
        search = HMMER.hmmsearch(hmm=models[3],

        if search.matches:
            motif = search.getFeatures(record)[0]
            # Map the hit back onto ppr: positions in the translated record
            # are multiplied by 3 and shifted by the slice start.
            offset = g.start - 15
            # NOTE(review): this FeatureLocation call is also cut off (no
            # closing parenthesis), and nothing visible adds motif to
            # ppr.features.
            motif.location = FeatureLocation(offset + 3 * motif.location.start,
                                             offset + 3 * motif.location.end,
    # Keep the feature list ordered by start position.
    ppr.features.sort(key=lambda p: p.location.start)
    return ppr
예제 #3
def test_search_results(domE=10.0, verbose=False):
	"""Score hmmsearch hits on the first ARA_FILE record against the known PPRs.

	Every item of ``get_known()[0]`` gets a ``hit`` counter that is reset to
	0 and then incremented once for each found feature whose midpoint is
	contained in that item (``midpoint in item``).

	domE    -- passed through to HMMER.hmmsearch as ``domE``
	verbose -- when true, also print a short summary

	Returns a dict with the keys 'matches', 'correct_matches', 'hit_pprs',
	'total_pprs' and 'hits'.
	"""
	known = get_known()[0]
	for k in known:
		k.hit = 0

	# The builtin next() replaces the Python-2-only iterator.next() method;
	# only the first record of the file is searched.
	ara = next(SeqIO.parse(ARA_FILE, 'fasta'))
	search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
	found = search.getFeatures(ara)

	for f in found:
		# The midpoint of a hit decides which known PPR(s) it is credited to.
		p = int((f.location.start + f.location.end) / 2)
		for k in known:
			if p in k:
				k.hit += 1

	hits = [k.hit for k in known]
	ret = {
		'matches': len(found),
		'correct_matches': sum(hits),
		'hit_pprs': sum(1 for h in hits if h > 0),
		'total_pprs': len(known),
		'hits': hits,
	}
	if verbose:
		# Single-argument print(...) produces the same output with the print
		# statement and with the print function.
		print("Found {} HMM domains, {} within known PPRs".format(
				ret['matches'], ret['correct_matches']))
		print("Hits in {} of {} PPR proteins".format(
				ret['hit_pprs'], ret['total_pprs']))
		# min()/max() raise ValueError on an empty list and the average would
		# divide by zero, so the per-PPR line is skipped when nothing is known.
		if hits:
			print("Hits per PPR: min: {} / max: {} / avg: {}".format(
					min(hits), max(hits), float(sum(hits)) / len(hits)))

	return ret
예제 #4
def test_search_results(domE=10.0, verbose=False):
    # Counts how many hmmsearch hits on the first ARA_FILE record fall inside
    # the known PPRs returned by get_known()[0]; domE is passed through to
    # HMMER.hmmsearch and verbose additionally prints a summary.
    known = get_known()[0]
    for k in known:
        k.hit = 0

    # NOTE(review): iterator.next() only exists in Python 2.
    ara = SeqIO.parse(ARA_FILE, 'fasta').next()

    search = HMMER.hmmsearch(hmm=extract.models[3], targets=ara, domE=domE)
    found = search.getFeatures(ara)

    for f in found:
        # The midpoint of each hit is tested against every known PPR.
        p = int((f.location.start + f.location.end) / 2)
        for k in known:
            if p in k:
                k.hit += 1

    hits = [j.hit for j in known]
    # NOTE(review): the dict literal below is never closed - the line holding
    # the closing brace is missing from this copy.
    ret = {
        'matches': len(found),
        'correct_matches': sum(hits),
        'hit_pprs': sum([1 for i in hits if i > 0]),
        'total_pprs': len(known),
        'hits': hits,
    if verbose:
        print "Found {} HMM domains, {} within known PPRs".format(
            ret['matches'], ret['correct_matches'])
        # NOTE(review): the next print is cut off after its first format
        # argument. Also, min()/max() and the average fail when hits is empty.
        print "Hits in {} of {} PPR proteins".format(ret['hit_pprs'],
        print "Hits per PPR: min: {} / max: {} / avg: {}".format(
            min(hits), max(hits),
            float(sum(hits)) / float(len(hits)))

    return ret
예제 #5
def classify(pprs, family_annot="ppr_family", tail_annot='ppr_tail'):
	"""Annotate each ppr with its family type, (P,PLS,E,E+,DYW)"""
	# Two annotations are written on every ppr: annotations[family_annot]
	# (the family) and annotations[tail_annot] (the tail format string).
	ct = get_c_terminus(pprs)
	(E, Ep, DYW) = utils.get_tail_models()

	# NOTE(review): h is never read in the visible code; tail.features is
	# inspected below, so presumably hmmsearch annotates the records in ct
	# in place - TODO confirm.
	h = HMMER.hmmsearch([E,Ep,DYW], ct)

	#annotate each tail

	for ppr,tail in zip(pprs, ct):
		fmt = ''
		if tail.features:
			# fmt becomes "-<type>" for every tail feature, ordered by start.
			# NOTE(review): `lambda(ft):` is Python-2-only syntax.
			f = sorted(tail.features, key=lambda(ft): int(ft.location.start))
			fmt = ("-{.type}"*len(f)).format(*f)
			if fmt[-3:] == 'DYW':
				ppr.annotations[family_annot] = 'DYW'
			elif fmt.find('E+') >= 0:
				ppr.annotations[family_annot] = 'E+'
			elif fmt.find('E') >= 0:
				ppr.annotations[family_annot] = 'E'
				# NOTE(review): an `else:` appears to be missing above this
				# print; as written the 'E' branch falls through and its
				# result is overwritten with '??'.
				print "Unknown tail format \'{}\'".format(fmt)
				ppr.annotations[family_annot] = '??'

			# NOTE(review): lines are missing in this region - the `if` inside
			# the loop has no body and the branch structure that leads to the
			# 'P' assignment is not visible in this copy.
			l = len(ppr.features[0])
			for f in ppr.features:
				if len(f) != l:
			ppr.annotations[family_annot] = 'P'

		ppr.annotations[tail_annot] = fmt
예제 #6
def locate_ppr(envelope):
	"""Find and annotate the protein within"""
	# Returns None when fewer than two motifs are found, otherwise a
	# SeqRecord cut from envelope with the motifs as features.
	#find all the PPR motifs
	search = HMMER.hmmsearch(hmm = models[3], targets = envelope)
	motifs = search.getFeatures(envelope)

	#A ppr must contain 2 or more PPR motifs
	if len(motifs) < 2:
		return None
	#order the motifs
	motifs.sort(key=lambda m: m.location.start)
	known_start = True
	known_stop = True
	#find start codon
	# Walk backwards from the first motif in steps of 3 until "atg" is seen
	# (compared case-insensitively).
	# NOTE(review): the loop stops at start == 0 without testing the codon
	# there, and only start < 0 clears known_start - confirm that is intended.
	start = motifs[0].location.start
	while start > 0 and str(envelope.seq[start:start+3]).lower() != "atg":
		start -= 3
	if start < 0:
		known_start = False
		start = 0

	#find stop codon
	# Walk forwards from the last motif in steps of 3 until a stop codon is
	# seen; stop ends up at the codon's first base, so the slice returned
	# below does not include the stop codon itself.
	# NOTE(review): stop == len(envelope) leaves known_stop True.
	stop = motifs[-1].location.end
	while stop < len(envelope) and (
		str(envelope.seq[stop:stop+3]).lower() not in ["tag", "tga", "taa"]):
		stop += 3
	if stop > len(envelope):
		known_stop = False
		stop = len(envelope)

	#move the motifs
	# Motif coordinates become relative to the chosen start.
	for m in motifs:
		m.location = FeatureLocation(m.location.start-start, m.location.end-start)

	#get absolute start and end
	# NOTE(review): an `else:` appears to be missing between the two pairs of
	# assignments; as written the second pair always overwrites the first.
	if envelope.annotations['src_strand'] > 0:
		src_from = envelope.annotations['src_from'] + start
		src_to   = envelope.annotations['src_from'] + stop
		src_from = envelope.annotations['src_to'] - stop
		src_to   = envelope.annotations['src_to'] - start

	# NOTE(review): the dict literal is never closed - the line holding the
	# closing brace is missing from this copy.
	annotations = {
				"src_from"	: src_from,
				"src_to"		: src_to,
				"src_strand": envelope.annotations['src_strand'],
	# 'no_stop' / 'no_start' are only added when the codon search ran off
	# the envelope.
	if not known_stop:
		annotations['no_stop'] = True
	if not known_start:
		annotations['no_start'] = True

	#return a record
	return SeqRecord(envelope.seq[start:stop],
			features = motifs,
			annotations = annotations)
예제 #7
def locate_ppr(envelope):
    """Find and annotate the protein within"""
    # Returns None when fewer than two motifs are found; otherwise a
    # SeqRecord is built from the envelope slice between start and stop.
    #find all the PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=envelope)
    motifs = search.getFeatures(envelope)

    #A ppr must contain 2 or more PPR motifs
    if len(motifs) < 2:
        return None

    #order the motifs
    motifs.sort(key=lambda m: m.location.start)
    known_start = True
    known_stop = True
    #find start codon
    # Step back 3 positions at a time from the first motif until "atg" is
    # found (case-insensitive comparison).
    # NOTE(review): start == 0 ends the loop without checking the codon at 0
    # and known_start stays True; only a negative start clears it.
    start = motifs[0].location.start
    while start > 0 and str(envelope.seq[start:start + 3]).lower() != "atg":
        start -= 3
    if start < 0:
        known_start = False
        start = 0

    #find stop codon
    # Step forward 3 positions at a time from the last motif until a stop
    # codon is found; stop is the index of that codon's first base.
    # NOTE(review): stop == len(envelope) leaves known_stop True.
    stop = motifs[-1].location.end
    while stop < len(envelope) and (str(envelope.seq[stop:stop + 3]).lower()
                                    not in ["tag", "tga", "taa"]):
        stop += 3
    if stop > len(envelope):
        known_stop = False
        stop = len(envelope)

    #move the motifs
    # Shift every motif so its coordinates are relative to start.
    for m in motifs:
        m.location = FeatureLocation(m.location.start - start,
                                     m.location.end - start)

    #get absolute start and end
    # NOTE(review): an `else:` appears to be missing between the two pairs of
    # assignments; as written the second pair always overwrites the first.
    if envelope.annotations['src_strand'] > 0:
        src_from = envelope.annotations['src_from'] + start
        src_to = envelope.annotations['src_from'] + stop
        src_from = envelope.annotations['src_to'] - stop
        src_to = envelope.annotations['src_to'] - start

    # NOTE(review): the dict literal is never closed - the closing brace line
    # is missing from this copy.
    annotations = {
        "src_from": src_from,
        "src_to": src_to,
        "src_strand": envelope.annotations['src_strand'],
    if not known_stop:
        annotations['no_stop'] = True
    if not known_start:
        annotations['no_start'] = True

    #return a record
    # NOTE(review): the SeqRecord call is cut off after its first argument.
    return SeqRecord(envelope.seq[start:stop],
예제 #8
	def test_annotation(self):
		# Search the same GenBank record twice - once as read, once with its
		# sequence reverse-complemented - and check that features paired up
		# from both records extract to the same sequence text.
		model = hmmfile.read('tests/data/valid.hmm')

		# Both records start without features; the second is flipped.
		forward = SeqIO.read('tests/data/PPR10.gb', 'genbank')
		forward.features = []
		flipped = SeqIO.read('tests/data/PPR10.gb', 'genbank')
		flipped.features = []
		flipped.seq = flipped.seq.reverse_complement()

		# The search objects are not inspected; only the records' feature
		# lists are compared afterwards.
		forward_search = HMMER.hmmsearch(model, forward)
		flipped_search = HMMER.hmmsearch(model, flipped)

		for fwd_feature, rev_feature in zip(forward.features, flipped.features):
			fwd_text = str(fwd_feature.extract(forward.seq))
			rev_text = str(rev_feature.extract(flipped.seq))
			self.assertEqual(fwd_text, rev_text)
예제 #9
	def test_arguments(self):
		# Checks the argument list produced by getArgs: max=True is expected
		# to give '--max' and E='something' to give '-E', 'something'.
		# NOTE(review): both SeqIO.read calls below are cut off with the
		# parenthesis still open; their endings are missing from this copy.
		seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
		seqdb = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',

		j = HMMER.jackhmmer(seq, seqdb)
		args = j.getArgs(max=True, E='something')

		self.assertEqual(args, ['--max', '-E', 'something'])
예제 #10
	def test_jackhmmer_dna(self):
		# Runs jackhmmer against the DNA target fixture and compares the
		# string form of its matches with the expected matches, which are
		# loaded using the protein target fixture.
		# NOTE(review): the SeqIO.read calls in this test are cut off with the
		# parenthesis still open; their endings are missing from this copy.
		seq = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta', 
		seqdb = SeqIO.read('tests/data/dna_target.fasta', 'fasta',

		j = HMMER.jackhmmer(seq, seqdb)

		#load the expected output
		seqdb_prot = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
		seq_ = HMMER.wrap_seqrecords([seq,])
		seqdb_ = HMMER.wrap_seqrecords([seqdb_prot,])
		matches = matchfile.load('tests/data/jack_out', seq_, seqdb_)
		#scale the matches' locations to match the protein search
		# NOTE(review): the loop below has no body - the scaling statements
		# are missing from this copy.
		for m in matches:

		self.assertEqual([str(m) for m in matches], [str(m) for m in j.matches])
예제 #11
def update_models():
	"""Rebuild the E, E+ and DYW tail HMMs and write them into utils.HMMDir."""
	print("Extracting C-Termini...")
	tails = extract.get_c_terminus(extract.extract(localization=None))
	print("Done. Got {} tails".format(len(tails)))
	(E, Ep, DYW) = utils.get_tail_consensus()

	# (label, consensus, output file); handled in the order E, E+, DYW.
	jobs = (
		('E', E, 'E.hmm'),
		('E+', Ep, 'E+.hmm'),
		('DYW', DYW, 'DYW.hmm'),
	)
	for label, consensus, filename in jobs:
		print(label)
		# The last HMM of the jackhmmer run is the one that gets named and
		# saved.
		model = HMMER.jackhmmer(consensus, tails).hmms[-1]
		model.name = label
		hmmfile.write(model, os.path.join(utils.HMMDir, filename))
예제 #12
	def test_translation_search(self):
		# Searches the DNA target fixture with the valid HMM and expects 17
		# matches, all in frame 1, both on the match objects and on the
		# SeqFeatures derived from them.
		# NOTE(review): the SeqIO.parse call below is cut off with the
		# parenthesis still open; its ending is missing from this copy.
		t = SeqIO.parse('tests/data/dna_target.fasta', 'fasta',
		h = HMMER.hmmsearch('tests/data/valid.hmm', t)
		self.assertEqual(len(h.matches), 17)
		check_valid(self, h.matches)

		#check that matches all have frame 1
		for m in h.matches:
			self.assertEqual(m.getFrame(), 1)

		#test feature extraction
		for m in h.matches:
			f = m.asSeqFeature()
			self.assertEqual(f.qualifiers['frame'], 1)
예제 #13
	def test_search(self):
		# Searches the protein target fixture with the valid HMM, expects 17
		# matches and passes them to check_valid.
		# NOTE(review): the SeqIO.parse call below is cut off with the
		# parenthesis still open; its ending is missing from this copy.
		t = SeqIO.parse('tests/data/matchtarget.fasta', 'fasta',
		h = HMMER.hmmsearch('tests/data/valid.hmm', t)
		self.assertEqual(len(h.matches), 17)
		check_valid(self, h.matches)
예제 #14
	def setUp(self):
		# Build a fake search: create the object without arguments, then
		# attach this fixture's HMM and target as one-element lists.
		self.hs = fake = HMMER.hmmsearch()
		fake.hmms = [self.hmm]
		fake.targets = [self.target]
예제 #15
def find_homologs():
    """Predict homologs of PPRs in other genomes based on footprints"""
    # Outline of the visible code: for every PPR and plastid, jackhmmer looks
    # for homologs of the PPR's genes, binding domains are searched in the
    # region around each homolog, and per-plastid statistics are written to
    # the file "tmp".
    # NOTE(review): many lines of this function are missing from this copy
    # (unclosed brackets, `if`/`except` without bodies, no `try:`); the notes
    # below mark each spot.
    pprs = load_pprs()
    # NOTE(review): the exclude list and the call are never closed.
    plastids = load_plastids(exclude=[
        "Arabidopsis thaliana",
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    # NOTE(review): the closing bracket of this comprehension is missing.
    exact_features = [
        f for f in known_binding.features if "exact" in f.type.lower()
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    print "Loaded {} pprs and {} plastids".format(len(pprs), len(plastids))

    for k, ppr in enumerate(pprs):
        print "Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs))
        # Footprints are the features typed "<ppr name>_exact" (compared in
        # lower case).
        # NOTE(review): the closing bracket of this comprehension is missing.
        footprints = [
            f for f in exact_features
            if f.type.lower() == "{}_exact".format(ppr.name.lower())
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]

        print "\tFound {} original genes, {}".format(len(
            ppr.genes), [g.qualifiers['gene'] for g in ppr.genes])

        ppr.potentialHomologs = {}

        for i, plastid in enumerate(plastids):

            # NOTE(review): this `if` has no body in this copy.
            if plastid.name != "Alsophila spinulosa":

            print "\t\tSearch {}/{}".format(i + 1, len(plastids))

            #search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                # NOTE(review): the print and the getFeatures call below are
                # both cut off with the parenthesis still open.
                print "{} -> {} homologs".format(gene.qualifiers['gene'],
                homologs += search.getFeatures(

            #extract the sequence surrounding each homolog
            # Each location is widened by 500 on both sides, clamped to the
            # plastid bounds.
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

            #find exact or close to exact binding domains for each and add to the
            #list of potential homologs for the PPR
            ph = []
            for h in homologs:
                domains = []
                for exact in ppr.exact:
                        # NOTE(review): the `try:` line is missing, the
                        # get_domains call is cut off and the `except` has
                        # no body.
                        domains += binding.get_domains(exact,
                    except KeyError:
                if domains:
                    # Sorted by descending 'odds'; the first domain is used.
                    domains.sort(key=lambda d: -d.qualifiers['odds'])
                    seq = str(domains[0].extract(h).seq)
                    # NOTE(review): the max([...]) expression is never closed.
                    similarity = max([
                        sequence_similarity(original, seq)
                        for original in ppr.footprints
                    print "  {} -> \'{}\'".format(h.type, seq)
                    ph.append((similarity, seq))

            # Most similar candidates first.
            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            #try and avoid running out of RAM
            # NOTE(review): the statement this comment referred to is missing.

    for ppr in pprs:
        print "\'{}\' footprints = {}".format(ppr.name, ppr.footprints)
        print "potential homologs"
        for key, value in ppr.potentialHomologs.iteritems():
            print "{}: {}".format(key, value)


    # Per plastid: total number of potential homologs and summed similarity
    # over all PPRs.
    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            length += len(ppr.potentialHomologs[plastid.name])
            similarity += sum(
                [p[0] for p in ppr.potentialHomologs[plastid.name]])

                # NOTE(review): the `try:` line and both `stats.append({`
                # lines with their closers are missing; only the dict entries
                # remain.
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                'avg_homologs': length / len(pprs),
        except ZeroDivisionError:
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,

    stats.sort(key=lambda s: -s['avg_similarity'])

    # NOTE(review): the file is never closed in the visible code.
    f = open("tmp", "w")

    # Only the 50 entries with the highest average similarity are written.
    for s in stats[0:50]:
        f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
예제 #16
def simple_extract(target, localization = None, verbose=False):
	"""Extract all the PPRs found in target"""
	# target must be a SeqRecord (TypeError otherwise); localization, when
	# set, keeps only PPRs whose 'localization' annotation equals it; verbose
	# prints progress messages. Returns the list of PPR records.
	if not isinstance(target, SeqRecord):
		# NOTE(review): this raise is cut off - the format argument and the
		# closing parentheses are missing from this copy.
		raise TypeError("simple_extract requires a Bio.SeqRecord, not {}".format(

	if verbose:
		print "Searching..."
	#find all easy-to-locate PPR motifs
	search = HMMER.hmmsearch(hmm = models[3], targets = target)

	#get features for each motif
	motifs = search.getFeatures(target)

	if verbose:
		print "Got {} motifs, grouping...".format(len(motifs))
	#group features by frame and location
	groups = group_motifs(motifs, max_gap=1500)
	if verbose:
		print "Got {} groups, extracting envelopes...".format(len(groups))

	pprs = []
	# NOTE(review): dbg_env only accumulates envelopes; it is never read in
	# the visible code.
	dbg_env = []
	# Repeats until remove_overlaps / remove_overgrown hand back no more
	# groups to re-process.
	while groups:
		if verbose:
			print "Got {} groups, extracting envelopes...".format(len(groups))
		#extract the sequence envelope around each group
		envelopes = [get_envelope(group, target, margin=1000) for group in groups]
		dbg_env += envelopes

		if verbose:
			print "Got {} envelopes, locating PPRs...".format(len(envelopes))
		#locate the PPR within each envelope
		for envelope in envelopes:
			ppr = locate_ppr(envelope)
			# NOTE(review): this `if` has no body in this copy (presumably
			# the ppr was appended to pprs - TODO confirm).
			if ppr:

		#look for overlapping pprs
		groups = remove_overlaps(pprs)
		ol = len(groups)
		if verbose:
			print "{} conflicts".format(ol)
		groups += remove_overgrown(pprs, 500)
		if verbose:
			print "{} overgrown PPRs".format(len(groups) - ol)
	pprs = [add_source(p, target) for p in pprs]
	if verbose:
		print "Got {} PPRs, cleaning...".format(len(pprs))
	#clean the gaps between features
	pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

	#annotate the tail region and classify each PPR
	# NOTE(review): the statement belonging to the comment above is missing.

	#predict each PPR's target
	targetp.targetp(pprs, annotation='localization')

	#filter the desired location
	if localization:
		pprs = [p for p in pprs if p.annotations['localization'] == localization]

	#return a list of nicely presented PPRs
	return pprs
예제 #17
def simple_extract(target, localization=None, verbose=False):
    """Extract all the PPRs found in target"""
    # target must be a SeqRecord (TypeError otherwise); localization, when
    # set, keeps only PPRs whose 'localization' annotation equals it; verbose
    # prints progress messages. Returns the list of PPR records.
    if not isinstance(target, SeqRecord):
        # NOTE(review): this raise is cut off - the format argument and the
        # closing parentheses are missing from this copy.
        raise TypeError(
            "simple_extract requires a Bio.SeqRecord, not {}".format(

    if verbose:
        print "Searching..."
    #find all easy-to-locate PPR motifs
    search = HMMER.hmmsearch(hmm=models[3], targets=target)

    #get features for each motif
    motifs = search.getFeatures(target)

    if verbose:
        print "Got {} motifs, grouping...".format(len(motifs))
    #group features by frame and location
    groups = group_motifs(motifs, max_gap=1500)
    if verbose:
        print "Got {} groups, extracting envelopes...".format(len(groups))

    pprs = []
    # NOTE(review): dbg_env only accumulates envelopes; it is never read in
    # the visible code.
    dbg_env = []

    # Repeats until remove_overlaps / remove_overgrown hand back no more
    # groups to re-process.
    while groups:
        if verbose:
            print "Got {} groups, extracting envelopes...".format(len(groups))
        #extract the sequence envelope around each group
        # NOTE(review): the closing bracket of this list is missing.
        envelopes = [
            get_envelope(group, target, margin=1000) for group in groups
        dbg_env += envelopes

        if verbose:
            print "Got {} envelopes, locating PPRs...".format(len(envelopes))
        #locate the PPR within each envelope
        for envelope in envelopes:
            ppr = locate_ppr(envelope)
            # NOTE(review): this `if` has no body in this copy (presumably
            # the ppr was appended to pprs - TODO confirm).
            if ppr:

        #look for overlapping pprs
        groups = remove_overlaps(pprs)
        ol = len(groups)
        if verbose:
            print "{} conflicts".format(ol)
        groups += remove_overgrown(pprs, 500)
        if verbose:
            print "{} overgrown PPRs".format(len(groups) - ol)

    pprs = [add_source(p, target) for p in pprs]

    if verbose:
        print "Got {} PPRs, cleaning...".format(len(pprs))
    #clean the gaps between features
    pprs = [clean_ends(fill_gaps(ppr)) for ppr in pprs]

    #annotate the tail region and classify each PPR
    # NOTE(review): the statement belonging to the comment above is missing.

    #predict each PPR's target
    targetp.targetp(pprs, annotation='localization')

    #filter the desired location
    if localization:
        # NOTE(review): the closing bracket of this list is missing.
        pprs = [
            p for p in pprs if p.annotations['localization'] == localization

    #return a list of nicely presented PPRs
    return pprs
예제 #18
def find_homologs():
	"""Predict homologs of PPRs in other genomes based on footprints"""
	# Outline of the visible code: for every PPR and plastid, jackhmmer looks
	# for homologs of the PPR's genes, binding domains are searched in the
	# region around each homolog, and per-plastid statistics are written to
	# the file "tmp".
	# NOTE(review): several lines of this function are missing from this copy
	# (a cut-off print, `if`/`except` without bodies, no `try:`); the notes
	# below mark each spot.
	pprs = load_pprs()
	plastids = load_plastids(exclude=["Arabidopsis thaliana",])
	known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
	exact_features = [f for f in known_binding.features if 
																								"exact" in f.type.lower()]
	ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
	ara_genes.sort(key=lambda g: g.location.start)

	print "Loaded {} pprs and {} plastids".format(len(pprs), len(plastids))

	for k,ppr in enumerate(pprs):
		# NOTE(review): the print below is cut off - its format arguments and
		# closing parenthesis are missing.
		print "Searching for homologs of \'{}\' ({}/{})".format(
		# Footprints are the features typed "<ppr name>_exact" (compared in
		# lower case).
		footprints = [f for f in exact_features if 
											f.type.lower() == "{}_exact".format(ppr.name.lower())]
		ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]

		print "\tFound {} original genes, {}".format(len(ppr.genes), 
				[g.qualifiers['gene'] for g in ppr.genes])

		ppr.potentialHomologs = {}

		for i,plastid in enumerate(plastids):

			# NOTE(review): this `if` has no body in this copy.
			if plastid.name != "Alsophila spinulosa":

			print "\t\tSearch {}/{}".format(i+1, len(plastids))

			#search for homologs of each gene
			homologs = []
			for gene in ppr.genes:
				g = SeqRecord(gene.extract(known_binding.seq).translate())
				search = HMMER.jackhmmer(g, plastid)
				# NOTE(review): the print below is cut off with the
				# parenthesis still open.
				print "{} -> {} homologs".format(gene.qualifiers['gene'],
				homologs += search.getFeatures(type="{}_hl".format(gene.qualifiers['gene']))
			#extract the sequence surrounding each homolog
			# Each location is widened by 500 on both sides, clamped to the
			# plastid bounds.
			for h in homologs:
				h.location = FeatureLocation(
						max(0,h.location.start - 500),
						min(len(plastid), h.location.end+500))
			homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

			#find exact or close to exact binding domains for each and add to the
			#list of potential homologs for the PPR
			ph = []
			for h in homologs:
				domains = []
				for exact in ppr.exact:
						# NOTE(review): the `try:` line is missing and the
						# `except` below has no body.
						domains += binding.get_domains(exact, h, percentile=100.0, gaps=0)
					except KeyError:
				if domains:
					# Sorted by descending 'odds'; the first domain is used.
					domains.sort(key=lambda d: -d.qualifiers['odds'])
					seq = str(domains[0].extract(h).seq)
					similarity = max([sequence_similarity(original, seq) for 
																									original in ppr.footprints])
					print "  {} -> \'{}\'".format(h.type, seq)
					ph.append((similarity, seq))

			# Most similar candidates first.
			ph.sort(key=lambda p: -p[0])
			ppr.potentialHomologs[plastid.name] = ph
			#try and avoid running out of RAM
			# NOTE(review): the statement this comment referred to is missing.

	for ppr in pprs:
		print "\'{}\' footprints = {}".format(ppr.name, ppr.footprints)
		print "potential homologs"
		for key,value in ppr.potentialHomologs.iteritems():
			print "{}: {}".format(key, value)


	# Per plastid: total number of potential homologs and summed similarity
	# over all PPRs.
	stats = []
	for plastid in plastids:
		length = 0
		similarity = 0.0
		for ppr in pprs:
			length += len(ppr.potentialHomologs[plastid.name])
			similarity += sum([p[0] for p in ppr.potentialHomologs[plastid.name]])
			# NOTE(review): the `try:` matching the `except` below is missing
			# from this copy.
			stats.append({'name': plastid.name,
									'avg_similarity': similarity / float(length),
									'avg_homologs': length / len(pprs),})
		except ZeroDivisionError:
			stats.append({'name': plastid.name,
										'avg_similarity': 0.0,
										'avg_homologs': 0,})

	stats.sort(key=lambda s: -s['avg_similarity'])

	# NOTE(review): the file is never closed in the visible code.
	f = open("tmp", "w")

	# Only the 50 entries with the highest average similarity are written.
	for s in stats[0:50]:
		f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
예제 #19
def fill_gaps(ppr):
	"""Look for reluctant motifs in suspiciously sized gaps"""

	# Candidate gaps are those find_gaps reports between 3*30 and 3*40 long
	# (presumably 30-40 codons expressed in nucleotides - TODO confirm).
	gaps = find_gaps(ppr, mingap=3*30, maxgap=3*40)	
	# Check if there could be a motif at the start: the stretch before the
	# first feature is added as an extra gap when it is longer than 3*30.
	if ppr.features[0].location.start > 3*30:
		gaps.append(FeatureLocation(0, ppr.features[0].location.start, 1))

	for g in gaps:
		# Search a translated copy of the gap plus a 15 position margin on
		# each side.
		# NOTE(review): for the leading gap g.start is 0, so g.start-15 is
		# negative - confirm how ppr[...] slicing treats that.
		record = ppr[g.start-15:g.end+15]
		record.seq = record.seq.translate()

		# Increase the reporting thresholds.
		# NOTE(review): the hmmsearch call below is cut off after `targets`;
		# the threshold arguments and the closing parenthesis are missing
		# from this copy.
		search = HMMER.hmmsearch(hmm = models[3], targets = record, 
		if search.matches:
			motif = search.getFeatures(record)[0]
			# Map the hit back onto ppr: positions in the translated record
			# are multiplied by 3 and shifted by the slice start.
			# NOTE(review): nothing visible adds motif to ppr.features.
			offset = g.start-15
			motif.location = FeatureLocation(
					offset + 3*motif.location.start,
					offset + 3*motif.location.end,
					strand = 1)
	# Keep the feature list ordered by start position.
	ppr.features.sort(key=lambda p: p.location.start)
	return ppr