示例#1
0
def compare_motif(motif_filename1, motif_filename2):
	"""
	Compare two motif files by handing their furnished forms to test2.

	Each file is read with read_motif and passed through furnish_motif,
	which yields a (consensus, rf) pair.  Per the original author's note,
	furnishing removes gaps and reduces the consensus to just '(' ')' '.'.
	The first element returned by findPRs for each consensus is forwarded
	as well.

	Returns whatever test2 returns.
	"""
	struct_a, ref_a = furnish_motif(*read_motif(motif_filename1))
	struct_b, ref_b = furnish_motif(*read_motif(motif_filename2))

	pairs_a = findPRs(struct_a)[0]
	pairs_b = findPRs(struct_b)[0]

	# The trailing literal arguments are passed through unchanged; their
	# meaning is defined by test2, which is not visible from here.
	return test2(
		ref_a, ref_b,
		struct_a, struct_b,
		pairs_a, pairs_b,
		'XV', 'VX', 2000, True, ['junk1'], ['junk2'])
def main(sto_filename, num):
	"""
	Print a Stockholm alignment holding `num` randomly chosen sequences.

	sto_filename -- path of the Stockholm file to sample from
	num          -- how many records to draw (random.sample raises
	                ValueError if this exceeds the number of records)

	The SS_cons and RF annotation lines are taken from
	miscCMF.read_motif(sto_filename); an empty RF is replaced by a run of
	'?' as long as SS_cons.  Output goes to stdout, warnings to stderr.

	NOTE(review): no terminating '//' line is printed here -- confirm
	whether the consumer of this output expects one.
	"""
	# Close the input file as soon as every record has been loaded.
	with open(sto_filename) as handle:
		recs = SeqIO.to_dict(SeqIO.parse(handle, 'stockholm'))

	ss_cons, rf = miscCMF.read_motif(sto_filename)
	if not rf:
		rf = '?' * len(ss_cons)

	print("# STOCKHOLM 1.0")
	# random.sample needs a real sequence; a dict view is rejected on
	# recent Python 3 releases.
	for r in random.sample(list(recs.values()), num):
		if len(r.id) >= SEQ_ID_LEN:
			sys.stderr.write("{0} exceeeds seq len! do something!\n".format(r.id))
		# Always keep at least one blank between id and sequence so that an
		# over-long id cannot be fused with the sequence text.
		pad = ' ' * max(1, SEQ_ID_LEN - len(r.id))
		print("{0}{1}{2}".format(r.id, pad, r.seq))
	print("#=GC SS_cons" + ' '*(SEQ_ID_LEN-len("#=GC SS_cons")) + ss_cons)
	print("#=GC RF " + ' '*(SEQ_ID_LEN-len("#=GC RF ")) + rf)
示例#3
0
def main(sto_filename):
	"""
	Re-print a Stockholm alignment with short sequential ids (T1, T2, ...).

	sto_filename -- path of the Stockholm file to rewrite

	For every record a '#=GS <new id> AC <original id>' line is printed
	before the sequence line, so the original id stays recoverable.  The
	SS_cons and RF annotation lines come from
	miscCMF.read_motif(sto_filename); an empty RF is replaced by a run of
	'?' as long as SS_cons.  Everything is written to stdout and the
	alignment is terminated with '//'.
	"""
	# Close the input file as soon as every record has been loaded.
	with open(sto_filename) as handle:
		recs = SeqIO.to_dict(SeqIO.parse(handle, 'stockholm'))

	ss_cons, rf = miscCMF.read_motif(sto_filename)
	if not rf:
		rf = '?' * len(ss_cons)

	print("# STOCKHOLM 1.0")
	for n, r in enumerate(recs.values(), 1):
		# 'short_id' rather than 'id' so the builtin is not shadowed.
		short_id = 'T' + str(n)
		print("#=GS {0} AC {1}".format(short_id, r.id))
		print("{0}{1}{2}".format(short_id, ' '*(SEQ_ID_LEN-len(short_id)), r.seq))
	print("#=GC SS_cons" + ' '*(SEQ_ID_LEN-len("#=GC SS_cons")) + ss_cons)
	print("#=GC RF " + ' '*(SEQ_ID_LEN-len("#=GC RF ")) + rf)
	print('//')
def main():
	"""
	Collect the furnished SS_cons of every motif into one file per family.

	Changes the working directory to MOTIF_DIR and walks its entries.  The
	family of an entry is dir_info[entry]['family']; the first time a
	family is seen, <pardir>/STORE_DIR/<family>.SS_cons.compare.txt is
	created and seeded with the family's reference SS_cons taken from
	db_summary[family]['SS_CONS'].  Then, for every '*.cmsearched' file in
	the entry, the motif file of the same name minus that extension is read
	and its furnished SS_cons appended as a '>name' / structure pair.

	Every output file is closed on exit, even when an error interrupts the
	walk, so data already written is flushed to disk.
	"""
	handles = {}
	os.chdir(MOTIF_DIR)
	try:
		for d in os.listdir(os.path.curdir):
			fam = dir_info[d]['family']
			if fam not in handles:
				out_path = os.path.join(os.pardir, STORE_DIR, fam+'.SS_cons.compare.txt')
				# Register the handle before writing so that it is closed
				# even if the header write below fails.
				handles[fam] = open(out_path, 'w+')
				handles[fam].write(">{fam}\n{ss_cons}\n".format(fam=fam, ss_cons=db_summary[fam]['SS_CONS']))
			for searched_path in glob.iglob(d + "/*.cmsearched"):
				motif_name = os.path.splitext(os.path.basename(searched_path))[0]
				sys.stderr.write("extracting SS_cons from {0}....\n".format(motif_name))
				# CMFinder doesn't have the #=GC RF line, so just make a fake XXXX... string
				# since all we want is a furnished SS_cons
				ss_cons = miscCMF.read_motif(os.path.join(d, motif_name))[0]
				ss_cons = miscCMF.furnish_motif(ss_cons, 'X'*len(ss_cons))[0]
				handles[fam].write(">{motif}\n{ss_cons}\n".format(
						motif=motif_name,
						ss_cons=ss_cons))
	finally:
		for handle in handles.values():
			handle.close()
def calc_MCC(filename, fam, output_prefix, db_summary, score_cutoff=0.):
	"""
	Score every motif listed in an evaled file and write two reports.

	Each line of `filename` is tab-separated with 4 to 6 fields:
		motif_name, rank, pscore, tp_count[, tp_scores[, fp_scores]]
	where the two optional fields are comma-separated float scores.

	Outputs:
	<output_prefix>.txt --- a header line, then (per line)
		motif_name, rank, pscore, tp(count), fp, fn, tn, MCC, home-made-TP/FP score
	<output_prefix>.html --- a VARNA applet page drawing every motif,
		ordered by decreasing MCC.

	Parameters:
	filename      -- path of the evaled file to read
	fam           -- key into db_summary for the family being scored
	output_prefix -- path prefix of the .txt and .html files to create
	db_summary    -- mapping where db_summary[fam]['TRUE'] and
	                 db_summary[fam]['CONTROLS'] are the totals from which
	                 FN and TN are derived
	score_cutoff  -- a score counts as a hit when it is >= this value

	The motif drawn for each line is read from
	motifs/<name up to the first '.'>/<motif_name> relative to the current
	directory.

	Raises ValueError for a line that does not have 4, 5 or 6 fields.
	"""
	VARNA_APPLET_NAME = "VARNAv37.jar"
	COL_PER_VARNA = 2 # number of cols for Varna-applet motif drawing
	HEIGHT_PER_ROW_VARNA = 400 # per motif drawing height (px)
	WIDTH_VARNA = 1200 # width of the html page for Varna (px)
	import math
	import miscCMF

	# The HTML snippets use explicit \n and \t escapes so that the bytes
	# written do not depend on how this source file happens to be indented.
	applet_header = (
		'\n'
		'\t<applet  code="VARNA.class"\n'
		'\tcodebase="."\n'
		'\tarchive="{varna}"\n'
		'\twidth="{width}" height="{height}">\n'
		'\t<param name="rows" value="{rows}" />\n'
		'\t<param name="columns" value="{columns}" />\n'
		'\t'
	)
	motif_params = (
		'\n'
		'\t<param name="sequenceDBN{i}" value="{rf}" />\n'
		'\t<param name="structureDBN{i}" value="{cons}" />\n'
		'\t<param name="titleSize{i}" value="12" />\n'
		'\t<param name="title{i}" value="{motif_name}(MCC {MCC:.2f})" />\n'
		'\t'
	)

	chunk_to_write = [] # list of (MCC, chunk_dict)

	# Both outputs are opened up front and are closed on every exit path.
	with open(output_prefix+'.html', 'w') as f_html, \
			open(output_prefix+'.txt', 'w') as f_out:
		f_out.write("MOTIF\tRANK\tPSCORE\tTP\tFP\tFN\tTN\tMCC\tMyScore\n")

		with open(filename) as f:
			for line in f:
				raw = line.strip().split('\t')
				if not 4 <= len(raw) <= 6:
					raise ValueError("wacky!!! {0}".format(raw))
				# Missing trailing score fields are treated as empty.
				raw = raw + [''] * (6 - len(raw))
				# The count stored in the file is not used; TP is recomputed
				# from the scores against score_cutoff below.
				motif_name, rank, pscore, _tp_count, tps, fps = raw

				# Real lists (not lazy map objects) so len() and repeated
				# iteration behave the same on Python 2 and 3.
				tps = [float(s) for s in tps.split(',')] if len(tps) > 0 else []
				fps = [float(s) for s in fps.split(',')] if len(fps) > 0 else []
				TP = sum(1 for s in tps if s >= score_cutoff) # the rest are FN
				FP = sum(1 for s in fps if s >= score_cutoff) # the rest are TN
				FN = db_summary[fam]['TRUE'] - TP
				TN = db_summary[fam]['CONTROLS'] - FP
				# A zero denominator is replaced by 1, which yields MCC == 0.
				denom = max(1, math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
				MCC = ((TP * TN) - (FP * FN))*1. / denom
				myscore = calc_CMscan_separation_score(tps, fps) / max(1., TP)
				fields = (motif_name, rank, pscore, TP, FP, FN, TN, MCC, myscore)
				f_out.write('\t'.join(str(v) for v in fields) + '\n')

				ind = motif_name.find('.')
				if ind > 0:
					motif_filename = "motifs/{0}/{1}".format(motif_name[:ind], motif_name)
				else:
					motif_filename = "motifs/{0}/{0}".format(motif_name)
				cons, rf = miscCMF.furnish_motif(*miscCMF.read_motif(motif_filename))
				chunk_to_write.append( (MCC, {'motif_name':motif_name, 'cons':cons, 'rf': rf}) )

		# sort chunk_to_write by decreasing order of MCC
		chunk_to_write.sort(key=lambda x: x[0], reverse=True)
		N = len(chunk_to_write)

		# ---------------------- VARNA APPLET HTML WRITING ---------------------- #
		# Ceiling division, kept in integers so the row count never turns
		# into a float under Python 3's true division.
		rows = (N + COL_PER_VARNA - 1) // COL_PER_VARNA
		f_html.write(applet_header.format(
				varna=VARNA_APPLET_NAME,
				width=WIDTH_VARNA,
				height=rows*HEIGHT_PER_ROW_VARNA,
				columns=COL_PER_VARNA,
				rows=rows))
		# remember sequence/structureDBN<i> has to be 1-based!
		# so enumerate starts counting at 1
		for i, (MCC, chunk_dict) in enumerate(chunk_to_write, 1):
			f_html.write(motif_params.format(
					i=i,
					cons=chunk_dict['cons'],
					rf=chunk_dict['rf'],
					motif_name=chunk_dict['motif_name'],
					MCC=MCC))
		f_html.write("</applet>\n")