Пример #1
0
 def visualize_markup(self, index=1):
     """Plot the 'gene' features of one sequence as a Bokeh figure.

     For every entry of ``self.seq`` the record in ``self.gff`` whose ``id``
     matches (compared as strings) is looked up, and each of its features of
     type ``'gene'`` is turned into a ``GraphicFeature``. A sequence without
     a matching GFF record contributes an empty feature list, so positions
     in the collected list always line up with ``self.seq``.

     Args:
         index: Position (in ``self.seq`` order) of the sequence whose
             features are drawn. The default of 1 selects the second
             sequence.

     Raises:
         IndexError: If ``index`` is outside the range of ``self.seq``.
     """
     sequences = []
     for seq in self.seq:
         gff_record = next(
             (x for x in self.gff if str(x.id) == str(seq.id)), None)
         # Start from a fresh list for every sequence: otherwise a sequence
         # without a GFF record would either fail on an unbound name or
         # silently reuse the previous sequence's features.
         features = []
         if gff_record:
             for gene in gff_record.features:
                 # Only 'gene' features are drawn.
                 if gene.type != 'gene':
                     continue
                 location = gene.location
                 features.append(
                     GraphicFeature(location.start,
                                    location.end,
                                    location.strand,
                                    label=gene.qualifiers['Name'],
                                    color="#cffccc"))
         sequences.append(features)
     output_file("test.html")
     # NOTE(review): the plotted length is fixed at 1000 regardless of the
     # actual sequence length - confirm this is intended.
     record = GraphicRecord(sequence_length=1000, features=sequences[index])
     show(record.plot_with_bokeh(figure_width=5))
Пример #2
0
def _tblout_rows(handle):
	"""Yield the whitespace-split fields of every non-comment, non-blank line."""
	for row in handle:
		if row.startswith("#"):
			continue
		fields = row.split()
		if fields:
			yield fields


def _aligned_span(fields):
	"""Return the absolute distance between the 7th and 8th table columns.

	NOTE(review): presumably the "alifrom"/"ali to" columns of an nhmmer
	``--tblout`` table - confirm against the HMMER documentation.
	"""
	return abs(int(fields[7]) - int(fields[6]))


def _strand_to_int(symbol):
	"""Map a GFF strand column to +1 ("+"), -1 ("-") or 0 (anything else)."""
	return {"+": 1, "-": -1}.get(symbol, 0)


def _read_gff(gff):
	"""Parse GFF lines into per-contig feature rows and contig lengths.

	Returns:
		A ``(gff_data, mol_length)`` pair. ``gff_data[contig][gene_id]`` is
		the tab-split feature row, with ``gene_id`` built as
		``f"{contig}_{n}"`` where ``n`` is the text after the first "_" of
		the first attribute. ``mol_length[contig]`` is the length read from
		the most recent "# Seq..." comment line.

	Raises:
		ValueError: If a feature row appears before any "# Seq" line, so no
			length is known for its contig.
	"""
	gff_data = {}
	mol_length = {}
	length = None
	for rec in gff:
		rec = rec.rstrip()
		if rec.startswith("# Seq"):
			# The second ";"-separated field is "<key>=<length>".
			length = int(rec.split(";")[1].split("=")[1])
			continue
		# Skip blank lines, every other comment and a stray lone quote line.
		if not rec or rec.startswith("#") or rec == '"':
			continue
		line = rec.split("\t")
		contig = line[0]
		if contig not in mol_length:
			if length is None:
				raise ValueError(
					f"GFF feature for {contig!r} found before any '# Seq' header; "
					"sequence length is unknown")
			mol_length[contig] = length
		number = line[8].split(";")[0].split("_")[1]
		gff_data.setdefault(contig, {})[f"{contig}_{number}"] = line
	return gff_data, mol_length


def _read_hits(table):
	"""Parse the hit table into ``(hits, prots)``.

	``hits`` maps column 1 to the list of distinct column-3 values seen with
	it (in first-seen order); ``prots`` maps column 3 back to column 1.
	NOTE(review): presumably column 1 is the HMM profile name and column 3
	the protein id - confirm against the program producing the table.
	"""
	hits = {}
	prots = {}
	for fields in _tblout_rows(table):
		profile, protein = fields[0], fields[2]
		prots[protein] = profile
		# Accumulate hits per profile; a repeated pair must not reset the list.
		members = hits.setdefault(profile, [])
		if protein not in members:
			members.append(protein)
	return hits, prots


def _find_aux5_genes(fasta, gff_path, fna_path):
	"""Search for Aux5 with nhmmer/bedtools and classify the overlapping genes.

	Runs nhmmer with the "aux5" profile against ``fasta``, keeps hits whose
	aligned span exceeds 90% of 6000, intersects them with the GFF at
	``gff_path`` via bedtools, extracts the overlapping gene sequences from
	``fna_path`` and scans them with the four ``aux5_*.hmm`` profiles.

	Returns:
		``None`` when no Aux5 region passes the threshold; otherwise a dict
		mapping gene id to the text after the first "_" of the matching
		profile name (possibly empty).
	"""
	import shlex

	# Profile file name -> divisor for the 90% span threshold.
	# NOTE(review): these look like model lengths in nucleotides - confirm.
	aux5hmmstat = {'aux5_hcp.hmm' : 228, 'aux5_vgrg.hmm': 2100, 'aux5_tap.hmm' : 747, 'aux5_eff.hmm': 1725}

	# Context managers guarantee the temporary files are closed (and removed).
	with tempfile.NamedTemporaryFile(mode="r+", suffix=".nucl") as nucl, \
			tempfile.NamedTemporaryFile(mode="r+", suffix=".bed") as hmmout, \
			tempfile.NamedTemporaryFile(mode="r+", suffix=".gff") as aux5gff:
		# Paths are shell-quoted because the commands run through the shell.
		aux5_nhmmer = f" nhmmer --cpu 4 --F1 .1 -o /dev/null --tblout {shlex.quote(nucl.name)} /home/blast/prediction_server/server/hmm_profiles//aux5 {shlex.quote(str(fasta))}"
		print(f"Searching for Aux5 clusters: {aux5_nhmmer}")
		subprocess.call(aux5_nhmmer, shell=True)

		aux5_pres = False
		for fields in _tblout_rows(nucl):
			if _aligned_span(fields) / 6000 > 0.9:
				# BED needs start <= end; assumes the table reports
				# reverse-strand hits with from > to, so order the pair.
				low, high = sorted((int(fields[8]), int(fields[9])))
				hmmout.write(f"{fields[0]}\t{low}\t{high}\n")
				aux5_pres = True
		# seek() also flushes the rows to disk before bedtools reads the file.
		hmmout.seek(0)
		print(hmmout.read())
		if not aux5_pres:
			return None

		bedtool_cmd = f"bedtools intersect -a {shlex.quote(gff_path)} -b {shlex.quote(hmmout.name)} > {shlex.quote(aux5gff.name)}"
		subprocess.call(bedtool_cmd, shell=True)

		nucleotides = Fasta(fna_path)
		aux5 = {}
		with tempfile.NamedTemporaryFile(mode="r+", suffix=".fasta") as aux5_fh:
			for rec in aux5gff:
				rec = rec.rstrip()
				if not rec:
					continue
				line = rec.split("\t")
				number = line[8].split(";")[0].split("_")[1]
				gene_id = f"{line[0]}_{number}"
				aux5_fh.write(f">{gene_id}\n{str(nucleotides[gene_id])}\n")
			# Flush the sequences to disk so nhmmer can read them by name.
			aux5_fh.seek(0)
			for hmm, model_length in aux5hmmstat.items():
				with tempfile.NamedTemporaryFile(mode="r+", suffix=".out") as a5hmmout:
					a5nhmmer_cmd = f"nhmmer --F1 .1 -o /dev/null --tblout {shlex.quote(a5hmmout.name)} /home/blast/prediction_server/server/hmm_profiles/{hmm} {shlex.quote(aux5_fh.name)}"
					print(f"Running nhmmer: {a5nhmmer_cmd}")
					subprocess.call(a5nhmmer_cmd, shell=True, stderr=subprocess.DEVNULL)
					for fields in _tblout_rows(a5hmmout):
						if _aligned_span(fields) / model_length > 0.9:
							print(f"Found result for {hmm}: {fields[0]}")
							aux5[fields[0]] = fields[2].split("_")[1]
		return aux5


def _write_relabelled_fasta(records, path, labels):
	"""Write every record to ``path``, using ``labels[name]`` as header when present."""
	with open(path, "w") as out:
		for seq in records.keys():
			out.write(f">{labels.get(seq, seq)}\n{str(records[seq])}\n")


def main(input, faa, fna, fasta, gff, output, log, name):
	"""Name T6SS auxiliary-cluster genes, plot each contig and relabel the FASTA files.

	Steps, in order:
	  1. Parse ``gff`` into per-contig feature rows and contig lengths.
	  2. Parse ``input`` (a whitespace-separated hit table) into hit maps.
	  3. For each "vgrg" hit, build a cluster with ``build_cluster`` and name
	     it "Aux 1 or 4" or "Aux2" through ``add_names`` depending on the
	     roles found.
	  4. For each "aux3" hit, label the hit and its two neighbours and name
	     them "Aux3".
	  5. Search for Aux5 with nhmmer and bedtools and name the genes "Aux5".
	  6. Write ``<output>/<contig>.html`` for every contig that has at least
	     one feature whose attribute column starts with "Name=".
	  7. Write ``<output>/proteins.faa`` and ``<output>/nucleotides.fna`` with
	     named genes relabelled.
	  8. Write ``<output>/nohits.html`` when no contig had a named feature.

	Args:
		input: Iterable of hit-table lines.
		faa: Object whose ``name`` is the path of the protein FASTA.
		fna: Object whose ``name`` is the path of the gene nucleotide FASTA.
		fasta: Path of the genome FASTA searched by nhmmer.
		gff: Iterable of GFF lines that also exposes ``name`` (its path).
		output: Directory that receives the result files.
		log: Unused.
		name: Unused.

	Raises:
		ValueError: If the GFF has a feature row before any "# Seq" header.
	"""
	gff_path = gff.name
	# Not read afterwards; kept because constructing it may validate or index
	# the genome file - TODO confirm and drop if it has no side effects.
	genome = Fasta(fasta)
	gff_data, mol_length = _read_gff(gff)
	hits, prots = _read_hits(input)

	if "vgrg" in hits:
		for vgrg in hits['vgrg']:
			contig, number = vgrg.rsplit("_", 1)
			before = f"{contig}_{int(number)-1}"
			after = f"{contig}_{int(number)+1}"
			# Start from whichever neighbour also has a hit, else from vgrg itself.
			if before in prots:
				genes = build_cluster(fwd=True, start=before, hcp=True, annotations=hits, peptides=prots)
			elif after in prots:
				genes = build_cluster(fwd=False, start=after, hcp=True, annotations=hits, peptides=prots)
			else:
				genes = build_cluster(fwd=False, start=vgrg, hcp=False, annotations=hits, peptides=prots)
			roles = [genes[key] for key in genes]
			if any(role in ('hydrolase', 'lipase') for role in roles):
				print("Found Aux1 or Aux4")
				gff_data = add_names(genes, gff_data, "Aux 1 or 4")
			elif any(role in ('ntpase', 'transferase', 'lysm') for role in roles):
				print("Found Aux2")
				gff_data = add_names(genes, gff_data, "Aux2")
			else:
				print(f"Unknown partial cluster\n{genes}")

	print("Searching for Aux3 cluster")
	if "aux3" in hits:
		print("Aux3 present")
		cluster3 = {}
		for aux3 in hits["aux3"]:
			contig, number = aux3.rsplit("_", 1)
			before = f"{contig}_{int(number)-1}"
			after = f"{contig}_{int(number)+1}"
			# The neighbour that has a hit is labelled PAAR, the opposite one immunity.
			if before in prots:
				paar, immunity = before, after
			elif after in prots:
				paar, immunity = after, before
			else:
				continue
			cluster3[aux3] = "effector"
			cluster3[paar] = "PAAR"
			cluster3[immunity] = "immunity"
		gff_data = add_names(cluster3, gff_data, "Aux3")
	else:
		print("No Aux3")

	aux5 = _find_aux5_genes(fasta, gff_path, fna.name)
	if aux5 is not None:
		gff_data = add_names(aux5, gff_data, "Aux5")

	labels = {}
	no_hits = True
	for contig, rows in gff_data.items():
		features = []
		named = False
		for seq, row in rows.items():
			start, end, strand = int(row[3]), int(row[4]), _strand_to_int(row[6])
			if row[8].startswith("Name="):
				# Named genes are highlighted and labelled.
				label = row[8].split(";")[0].split("=")[1]
				features.append(
					GraphicFeature(start=start,
								   end=end,
								   strand=strand,
								   color="#ffd700",
								   label=label)
					)
				labels[seq] = label
				named = True
			else:
				features.append(GraphicFeature(start=start, end=end, strand=strand))

		# Only contigs carrying at least one named gene get a plot.
		if named:
			no_hits = False
			record = GraphicRecord(sequence_length=mol_length[contig], features=features)
			plot = record.plot_with_bokeh(figure_width=15)
			with open(f"{output}/{contig}.html", "w") as f:
				f.write(file_html(plot, CDN, f"{contig}"))

	# Written once, after every contig has contributed its labels; the
	# nucleotide file is built from the nucleotide FASTA, not the protein one.
	_write_relabelled_fasta(Fasta(faa.name), f"{output}/proteins.faa", labels)
	_write_relabelled_fasta(Fasta(fna.name), f"{output}/nucleotides.fna", labels)

	if no_hits:
		with open(f"{output}/nohits.html", "w") as sadface:
			sadface.write('<div class="splash"><div class="middle"><h1>No T6SS predicted :(</h1></div><div class="bottomleft"><p>T6SS.Vibriocholera.com</p></div></div>\n')