Exemplo n.º 1
0
	def test_arguments(self):
		# Build a jackhmmer search from the two protein fixtures and check how
		# getArgs() renders keyword options as a command-line argument list.
		protein = Alphabet.generic_protein
		query = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
				alphabet=protein)
		target = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
				alphabet=protein)

		search = HMMER.jackhmmer(query, target)

		# max=True is expected as a bare '--max' flag, E='something' as the
		# short flag '-E' followed by its value.
		expected = ['--max', '-E', 'something']
		self.assertEqual(search.getArgs(max=True, E='something'), expected)
Exemplo n.º 2
0
def update_models():
	"""Recalculate the HMM models.

	Extracts the C-termini, runs one jackhmmer search per tail consensus
	(E, E+ and DYW) against them, names the last HMM of each search after its
	consensus and writes it to ``<name>.hmm`` inside ``utils.HMMDir``.
	Progress is reported on stdout; nothing is returned.
	"""
	# Single-argument print(...) behaves the same as the print statement on
	# Python 2 and is also valid on Python 3.
	print("Extracting C-Termini...")
	ct = extract.get_c_terminus(extract.extract(localization=None))
	print("Done. Got {} tails".format(len(ct)))

	(E, Ep, DYW) = utils.get_tail_consensus()

	# The three models are built in exactly the same way; only the query
	# consensus and the model/file name differ.
	for name, consensus in (('E', E), ('E+', Ep), ('DYW', DYW)):
		print(name)
		j = HMMER.jackhmmer(consensus, ct)
		model = j.hmms[-1]
		model.name = name
		hmmfile.write(model, os.path.join(utils.HMMDir, name + '.hmm'))
Exemplo n.º 3
0
	def test_jackhmmer(self):
		# Run a protein-vs-protein jackhmmer search and compare its matches
		# with the result stored in tests/data/jack_out.
		protein = Alphabet.generic_protein
		query = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
				alphabet=protein)
		target = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
				alphabet=protein)

		search = HMMER.jackhmmer(query, target)

		# load the expected output; matchfile.load is given the wrapped query
		# and target records
		wrapped_query = HMMER.wrap_seqrecords([query])
		wrapped_target = HMMER.wrap_seqrecords([target])
		expected = matchfile.load('tests/data/jack_out',
				wrapped_query, wrapped_target)

		self.assertEqual(expected, search.matches)
Exemplo n.º 4
0
	def test_jackhmmer_dna(self):
		# Search a DNA target with a protein query and compare the result with
		# the stored protein-vs-protein matches after rescaling them.
		query = SeqIO.read('tests/data/jackhmmer_seq.fasta', 'fasta',
				alphabet=Alphabet.generic_protein)
		dna_target = SeqIO.read('tests/data/dna_target.fasta', 'fasta',
				alphabet=Alphabet.generic_dna)

		search = HMMER.jackhmmer(query, dna_target)

		# load the expected output, which is tied to the protein target
		protein_target = SeqIO.read('tests/data/matchtarget.fasta', 'fasta',
				alphabet=Alphabet.generic_protein)
		wrapped_query = HMMER.wrap_seqrecords([query])
		wrapped_target = HMMER.wrap_seqrecords([protein_target])
		expected = matchfile.load('tests/data/jack_out',
				wrapped_query, wrapped_target)

		# scale the stored match locations by 3 so they line up with the DNA
		# search (presumably residues -> nucleotides; confirm against scale())
		for match in expected:
			match.scale(3)

		# compare the string forms of the matches rather than the objects
		expected_text = [str(match) for match in expected]
		found_text = [str(match) for match in search.matches]
		self.assertEqual(expected_text, found_text)
Exemplo n.º 5
0
def find_homologs(only_plastid="Alsophila spinulosa"):
    """Predict homologs of PPRs in other genomes based on footprints.

    For every PPR, the genes returned by get_closest_gene() for its exact
    footprint features in the annotated Arabidopsis record are translated and
    searched for in the plastids with jackhmmer.  The region around each hit
    is scanned for binding domains, and the highest-odds domain of each
    region is stored, ranked by similarity to the PPR's footprints, in
    ``ppr.potentialHomologs[plastid.name]``.  Results are printed to stdout;
    the function returns None.

    only_plastid -- name of the only plastid to search.  The default keeps
        the restriction that used to be hard-coded in the loop; pass None to
        search every loaded plastid.
    """
    pprs = load_pprs()
    plastids = load_plastids(exclude=[
        "Arabidopsis thaliana",
    ])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [
        f for f in known_binding.features if "exact" in f.type.lower()
    ]
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid on Python 3.
    print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

    for k, ppr in enumerate(pprs):
        print("Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs)))
        wanted_type = "{}_exact".format(ppr.name.lower())
        footprints = [
            f for f in exact_features if f.type.lower() == wanted_type
        ]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]

        print("\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]))

        ppr.potentialHomologs = {}

        for i, plastid in enumerate(plastids):

            if only_plastid is not None and plastid.name != only_plastid:
                continue

            print("\t\tSearch {}/{}".format(i + 1, len(plastids)))

            #search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                print("{} -> {} homologs".format(gene.qualifiers['gene'],
                                                 len(search.matches)))
                homologs += search.getFeatures(
                    type="{}_hl".format(gene.qualifiers['gene']))

            #extract the sequence surrounding each homolog: 500 positions on
            #either side, clamped to the bounds of the plastid
            #NOTE(review): the new FeatureLocation is built without a strand;
            #confirm that this is intended
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            #keep the feature type next to the extracted record: the record
            #has no ``type`` attribute, so reading it there raised an
            #AttributeError as soon as a domain was found
            regions = [(h.type, SeqRecord(h.extract(plastid.seq)))
                       for h in homologs]

            #find exact or close to exact binding domains for each and add to the
            #list of potential homologs for the PPR
            ph = []
            for region_type, region in regions:
                domains = []
                for exact in ppr.exact:
                    try:
                        domains += binding.get_domains(exact,
                                                       region,
                                                       percentile=100.0,
                                                       gaps=0)
                    except KeyError:
                        #best effort: a KeyError from get_domains only skips
                        #this ``exact`` entry
                        continue
                if domains:
                    #highest odds first; only the best domain is reported
                    domains.sort(key=lambda d: -d.qualifiers['odds'])
                    seq = str(domains[0].extract(region).seq)
                    similarity = max([
                        sequence_similarity(original, seq)
                        for original in ppr.footprints
                    ])
                    print("  {} -> \'{}\'".format(region_type, seq))
                    ph.append((similarity, seq))

            #most similar candidates first
            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            #try and avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print("\'{}\' footprints = {}".format(ppr.name, ppr.footprints))
        print("potential homologs")
        for key, value in ppr.potentialHomologs.items():
            print("{}: {}".format(key, value))

    return

    #NOTE(review): everything below is unreachable because of the ``return``
    #above; it is kept so the per-plastid statistics can be re-enabled by
    #deleting that ``return``.
    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            #plastids skipped by ``only_plastid`` have no entry
            found = ppr.potentialHomologs.get(plastid.name, ())
            length += len(found)
            similarity += sum([p[0] for p in found])

        try:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                #floor division: same result as ``/`` on Python 2 ints
                'avg_homologs': length // len(pprs),
            })
        except ZeroDivisionError:
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])

    #the context manager closes the file even if a write fails
    with open("tmp", "w") as f:
        for s in stats[0:50]:
            f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
Exemplo n.º 6
0
def find_homologs(only_plastid="Alsophila spinulosa"):
	"""Predict homologs of PPRs in other genomes based on footprints.

	For every PPR, the genes returned by get_closest_gene() for its exact
	footprint features in the annotated Arabidopsis record are translated and
	searched for in the plastids with jackhmmer.  The region around each hit
	is scanned for binding domains, and the highest-odds domain of each
	region is stored, ranked by similarity to the PPR's footprints, in
	``ppr.potentialHomologs[plastid.name]``.  Results are printed to stdout;
	the function returns None.

	only_plastid -- name of the only plastid to search.  The default keeps
		the restriction that used to be hard-coded in the loop; pass None to
		search every loaded plastid.
	"""
	pprs = load_pprs()
	plastids = load_plastids(exclude=["Arabidopsis thaliana",])
	known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
	exact_features = [f for f in known_binding.features
			if "exact" in f.type.lower()]
	ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
	ara_genes.sort(key=lambda g: g.location.start)

	# Single-argument print(...) behaves the same as the print statement on
	# Python 2 and is also valid on Python 3.
	print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

	for k,ppr in enumerate(pprs):
		print("Searching for homologs of \'{}\' ({}/{})".format(
				ppr.name,k+1,len(pprs)))
		wanted_type = "{}_exact".format(ppr.name.lower())
		footprints = [f for f in exact_features
				if f.type.lower() == wanted_type]
		ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]

		print("\tFound {} original genes, {}".format(len(ppr.genes),
				[g.qualifiers['gene'] for g in ppr.genes]))

		ppr.potentialHomologs = {}

		for i,plastid in enumerate(plastids):

			if only_plastid is not None and plastid.name != only_plastid:
				continue

			print("\t\tSearch {}/{}".format(i+1, len(plastids)))

			#search for homologs of each gene
			homologs = []
			for gene in ppr.genes:
				g = SeqRecord(gene.extract(known_binding.seq).translate())
				search = HMMER.jackhmmer(g, plastid)
				print("{} -> {} homologs".format(gene.qualifiers['gene'],
						len(search.matches)))
				homologs += search.getFeatures(type="{}_hl".format(gene.qualifiers['gene']))

			#extract the sequence surrounding each homolog: 500 positions on
			#either side, clamped to the bounds of the plastid
			#NOTE(review): the new FeatureLocation is built without a strand;
			#confirm that this is intended
			for h in homologs:
				h.location = FeatureLocation(
						max(0,h.location.start - 500),
						min(len(plastid), h.location.end+500))
			#keep the feature type next to the extracted record: the record
			#has no ``type`` attribute, so reading it there raised an
			#AttributeError as soon as a domain was found
			regions = [(h.type, SeqRecord(h.extract(plastid.seq)))
					for h in homologs]

			#find exact or close to exact binding domains for each and add to the
			#list of potential homologs for the PPR
			ph = []
			for region_type, region in regions:
				domains = []
				for exact in ppr.exact:
					try:
						domains += binding.get_domains(exact, region, percentile=100.0, gaps=0)
					except KeyError:
						#best effort: a KeyError from get_domains only skips
						#this ``exact`` entry
						continue
				if domains:
					#highest odds first; only the best domain is reported
					domains.sort(key=lambda d: -d.qualifiers['odds'])
					seq = str(domains[0].extract(region).seq)
					similarity = max([sequence_similarity(original, seq)
							for original in ppr.footprints])
					print("  {} -> \'{}\'".format(region_type, seq))
					ph.append((similarity, seq))

			#most similar candidates first
			ph.sort(key=lambda p: -p[0])
			ppr.potentialHomologs[plastid.name] = ph

			#try and avoid running out of RAM
			gc.collect()

	for ppr in pprs:
		print("\'{}\' footprints = {}".format(ppr.name, ppr.footprints))
		print("potential homologs")
		for key,value in ppr.potentialHomologs.items():
			print("{}: {}".format(key, value))

	return

	#NOTE(review): everything below is unreachable because of the ``return``
	#above; it is kept so the per-plastid statistics can be re-enabled by
	#deleting that ``return``.
	stats = []
	for plastid in plastids:
		length = 0
		similarity = 0.0
		for ppr in pprs:
			#plastids skipped by ``only_plastid`` have no entry
			found = ppr.potentialHomologs.get(plastid.name, ())
			length += len(found)
			similarity += sum([p[0] for p in found])

		try:
			stats.append({'name': plastid.name,
					'avg_similarity': similarity / float(length),
					#floor division: same result as ``/`` on Python 2 ints
					'avg_homologs': length // len(pprs),})
		except ZeroDivisionError:
			stats.append({'name': plastid.name,
					'avg_similarity': 0.0,
					'avg_homologs': 0,})

	stats.sort(key=lambda s: -s['avg_similarity'])

	#the context manager closes the file even if a write fails
	with open("tmp", "w") as f:
		for s in stats[0:50]:
			f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))