def main():
	"""Print a tab-separated table of cmsearch true/false positives per motif.

	Changes into MOTIF_DIR and, for every entry ``d`` listed there:

	* looks up the family name via ``dir_info[d]['family']``;
	* reads ``<d>/<d>.fna.summary.pscore_added`` (comma-separated, with a
	  header row) and indexes its rows by the basename of the ``motif`` column;
	* parses every ``<d>/*.cmsearched`` file with
	  ``read_tab_cmscanned(path, 'score', 0)``; hit IDs starting with ``FAKE_``
	  are counted as false positives, all others as true positives.

	One row per .cmsearched file is written to stdout with the columns
	FAMILY, FAMSIZE (``db_summary[fam]['TRUE']``), MOTIF, RANK, PSCORE,
	TOTAL_RECALL (number of true-positive hits), TPS and FPS (comma-joined
	score lists). Progress messages go to stderr.

	Raises KeyError if an entry is missing from dir_info / db_summary, or if a
	.cmsearched file has no matching row in the summary file.
	"""
	os.chdir(MOTIF_DIR)
	fieldnames = ['FAMILY', 'FAMSIZE', 'MOTIF', 'RANK', 'PSCORE', 'TOTAL_RECALL', 'TPS', 'FPS']
	# Tab is used as the delimiter on purpose: the TPS/FPS columns are
	# themselves comma-joined score lists.
	# csv.writer takes no field-name argument (its second positional slot is
	# the dialect), so the header is simply written as the first row.
	out_csv = csv.writer(sys.stdout, delimiter='\t')
	out_csv.writerow(fieldnames)
	for d in os.listdir(os.path.curdir):
		fam = dir_info[d]['family']
		# motif basename --> its row from the .fna.summary.pscore_added file
		motif_info = {}
		summary_path = os.path.join(d, d + '.fna.summary.pscore_added')
		with open(summary_path) as handle:
			for row in csv.DictReader(handle, delimiter=','):
				motif_info[os.path.basename(row['motif'])] = row
		for hits_path in glob.iglob(d + "/*.cmsearched"):
			sys.stderr.write("reading {0}....\n".format(hits_path))
			tp = []
			fp = []
			hits = read_tab_cmscanned(hits_path, 'score', 0)
			for hit_id, (score, _start, _end) in hits.items():
				if hit_id.startswith('FAKE_'): # is a FP!
					fp.append(str(score))
				else:
					tp.append(str(score))
			motif_name = os.path.splitext(os.path.basename(hits_path))[0]
			info = motif_info[motif_name]
			out_csv.writerow([
				fam,
				db_summary[fam]['TRUE'],
				motif_name,
				info['Rank.index'],
				info['pscore'],
				len(tp),
				",".join(tp),
				",".join(fp),
			])
# Example #2
# 0
def run_infernal(motif, scan_filename, output_prefix):
	"""Build a CM from <motif>, search <scan_filename> with it, return the hits.

	Steps:
	1. ``cmbuild <motif>.cmbuilded <motif>``
	2. ``cmsearch --fil-T-hmm 10 --toponly -T 10 --tabfile <output_prefix>.hits
	   -o <output_prefix>.cmscanned <motif>.cmbuilded <scan_filename>``
	   (NOTE: the thresholds are still hard-coded; should change later)
	3. parse <output_prefix>.hits with miscInfernal.read_tab_cmscanned
	4. for simplicity, drop hits whose ID is a member listed in <motif>.fasta.

	Both programs are invoked with argument lists (no shell), so paths that
	contain spaces or shell metacharacters are passed through verbatim.
	A program that cannot be started or exits non-zero only produces a warning
	on stderr; execution continues as before.

	Returns the mapping produced by miscInfernal.read_tab_cmscanned with the
	motif members removed.
	"""
	import subprocess
	import miscInfernal

	cm_filename = motif + '.cmbuilded'
	commands = [
		['cmbuild', cm_filename, motif],
		['cmsearch', '--fil-T-hmm', '10', '--toponly', '-T', '10',
			'--tabfile', output_prefix + '.hits',
			'-o', output_prefix + '.cmscanned',
			cm_filename, scan_filename],
	]
	for argv in commands:
		try:
			status = subprocess.call(argv)
		except OSError as err:
			sys.stderr.write("WARNING: could not run {0}: {1}\n".format(argv[0], err))
			continue
		if status != 0:
			sys.stderr.write("WARNING: {0} exited with status {1}\n".format(argv[0], status))

	result = miscInfernal.read_tab_cmscanned(tab_filename=output_prefix+'.hits')

	# Collect the FASTA header lines of the motif members. A missing or
	# unreadable file just means there is nothing to remove.
	fasta_filename = motif + '.fasta'
	try:
		with open(fasta_filename) as handle:
			headers = [line.strip() for line in handle if line.startswith('>')]
	except (IOError, OSError) as err:
		sys.stderr.write("WARNING: could not read {0}: {1}\n".format(fasta_filename, err))
		headers = []

	# currently the IDs are HARDCODED like:  >AARF01000040.1/2307-2550(1-244)
	# Keep what lies between '>' and the first '('; a header without '(' is
	# used whole instead of silently losing its last character.
	for header in headers:
		member_id = header[1:].split('(', 1)[0]
		sys.stderr.write("removing from cmscan hits {0}....\n".format(member_id))
		# members that produced no hit are simply not present
		result.pop(member_id, None)

	return result
# Example #3
# 0
def evaluate_CMscaned(dir, scan_fasta_pickle, total_ranks_pickle, best_e_or_score, cutoff,\
		eval_func=None,\
		suffix='.nocalib_hmmT10.cmsearched_hits'):
	"""
	Report, per family, the motif whose CM scan has the best sensitivity.

	<dir> should be a directory containing files of format <motif_filename>.cmsearched_hits
	which are outputs from Infernal-1.0's cmsearch use --tabfile option.

	<scan_fasta_pickle> is a pickled mapping family --> number of family
	members. Keys ending in '-' (like moco-) are folded into the key without
	the dash (moco).

	<total_ranks_pickle> can either be from rankpl or pscore. Both will be a list of
	cluster_steps4.MotifRankInfo objects. Only objects whose .fam is not None
	are evaluated.

	<best_e_or_score> and <cutoff> are passed straight through to
	miscInfernal.read_tab_cmscanned.

	<eval_func> is called as eval_func(scanned_id, (best, start, end)) and
	returns either 'fam___id' or just 'fam'; in the latter case the scanned ID
	itself is used as the member ID. The default returns the part of the
	scanned ID before its first '___' (ValueError if there is none).

	<suffix> is appended to the motif filename (with '.html' and whatever
	follows it removed) to find the hits file inside <dir>.

	Output goes to stdout: one LaTeX table row per family, sorted
	case-insensitively, followed by two comma-separated lines with the scores
	of hits in the expected family ('real') and of all other hits ('random').
	Ties in sensitivity are broken by precision. Progress goes to stderr.
	Returns None.

	Raises KeyError if an evaluated family is missing from the family counts
	and ZeroDivisionError if its count is 0.
	"""
	if eval_func is None:
		eval_func = _default_eval_func

	# NOTE: unpickling can execute arbitrary code; only load trusted pickles.
	with open(total_ranks_pickle, 'rb') as handle:
		total_ranks = load(handle)
	with open(scan_fasta_pickle, 'rb') as handle:
		total_fam_counts = load(handle)
	_merge_dashed_family_counts(total_fam_counts)

	# Looking up an unseen family creates this all-zero entry, so every
	# evaluated family gets a row in the table even when nothing was hit.
	best_cmscan_by_family = defaultdict(lambda: {'sens':0., 'prec':0., 'motif':None})
	for_scatterplot = {'real':[], 'random':[]}

	# go through each MotifRankInfo obj, if family is not None, then we did a CM scan for it
	# refer to cluster_steps_generate_CMscan_cmds.py to see why this is the case
	for obj in total_ranks:
		fam = obj.fam
		if fam is None:
			continue
		# right now the motif_filename is stored like:
		# grid01_02_124.fna.1.motif.h1_2.dup_rmed.html (has .dup_rmed. it's used with rankpl)
		motif_name = obj.motif_filename
		html_at = motif_name.rfind('.html')
		if html_at != -1:
			# without this guard a name lacking '.html' would lose its last char
			motif_name = motif_name[:html_at]
		filename = motif_name + suffix
		hits_path = os.path.join(dir, filename)
		if not os.path.exists(hits_path):
			print("ERROR: file {0} doesn't exist".format(filename))
			continue

		# returns scanned_id --> (best_e_or_score, start, end)
		sys.stderr.write("file is {0}, family should be {1}\n".format(filename, fam))
		cm_results = miscInfernal.read_tab_cmscanned(hits_path, best_e_or_score, cutoff)
		if not cm_results:
			continue

		# family --> set of distinct member IDs that were hit
		members_by_family = defaultdict(set)
		for hit_id, (best, loc_start, loc_end) in cm_results.items():
			label = eval_func(hit_id, (best, loc_start, loc_end))
			rb_fam, sep, rb_id = label.partition('___')
			if not sep:
				# eval_func gave only the family (the default does): fall back
				# to the scanned ID, which is unique within cm_results
				rb_id = hit_id
			members_by_family[rb_fam].add(rb_id)
			if rb_fam == fam:
				for_scatterplot['real'].append(best)
			else:
				for_scatterplot['random'].append(best)

		# calculate sensitivity and precision(PPV)
		fam_hits = len(members_by_family.get(fam, ()))
		sens = float(fam_hits) / total_fam_counts[fam]
		prec = float(fam_hits) / len(cm_results)
		best_so_far = best_cmscan_by_family[fam]
		if sens > best_so_far['sens'] or \
				(sens == best_so_far['sens'] and prec > best_so_far['prec']):
			best_cmscan_by_family[fam] = {'sens': sens, 'prec': prec, 'motif': filename}

	# print the final results in latex table format
	# NOTE(review): this emits '\\hline' (row break followed by the text
	# "hline"); '\\ \hline' was probably intended -- left as is so existing
	# consumers of the output are unaffected. TODO confirm.
	for fam in sorted(best_cmscan_by_family, key=lambda name: name.lower()):
		best_entry = best_cmscan_by_family[fam]
		print("{fam} & {count} & {sens:.2f} & {prec:.2f} & {motif}\\\\hline".format(fam=fam, \
				count=total_fam_counts[fam], sens=best_entry['sens'], \
				prec=best_entry['prec'],\
				motif=best_entry['motif']))

	# print for_scatterplot
	print(",".join(map(str, for_scatterplot['real'])))
	print(",".join(map(str, for_scatterplot['random'])))


def _default_eval_func(id, hit):
	"""Return the part of scanned ID <id> before its first '___' (the family)."""
	return id[:id.index('___')]


def _merge_dashed_family_counts(total_fam_counts):
	"""Fold counts stored under keys ending in '-' into the un-dashed key, in place."""
	# iterate over a snapshot of the keys: the mapping is modified in the loop
	for key in list(total_fam_counts):
		if key is not None and key.endswith('-'):
			merged = total_fam_counts.get(key[:-1], 0) + total_fam_counts.pop(key)
			total_fam_counts[key[:-1]] = merged