예제 #1
0
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
	"""
	keep the bacteria showing an imbalance between the number of positive and negative samples
	and order them by their mean sign (nans are ignored)

	input:
	expdat : Experiment
	field,value,exclude,exact : passed to hs.filtersamples() to select the samples used for the test
		field=False (default) to use all the samples
	maxfval : float
		a bacteria is kept only if its hs.fdr() value is <= maxfval

	output:
	newexp : Experiment
		the kept bacteria of expdat (all samples), ordered by increasing mean sign over the tested samples
	"""
	# must stay the first statement so only the call arguments are captured
	params=locals()

	texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude) if field else hs.copyexp(expdat)

	# from here on only the sign of each value matters
	texp.data=np.sign(texp.data)
	numpos=np.nansum(texp.data>0,axis=1)
	numneg=np.nansum(texp.data<0,axis=1)
	numtested=numpos+numneg

	# binomial (p=0.5) tail probability per bacteria, taking the smaller of the two tails
	# bacteria without any positive or negative sample keep the initial p-value of 1
	pval=np.ones(len(numpos))
	hassamples=numtested>0
	tailpos=stats.binom.cdf(numpos[hassamples],numtested[hassamples],0.5)
	tailneg=stats.binom.cdf(numneg[hassamples],numtested[hassamples],0.5)
	pval[hassamples]=np.minimum(tailpos,tailneg)

	meansign=np.nanmean(texp.data,axis=1)

	# first drop the bacteria above the threshold, then sort the survivors by mean sign
	fval=hs.fdr(pval)
	keep=np.where(np.array(fval)<=maxfval)[0]
	newexp=hs.reorderbacteria(expdat,keep)
	order=np.argsort(meansign[keep])
	newexp=hs.reorderbacteria(newexp,order)

	newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
	hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
	return newexp
예제 #2
0
def annotationenrichment(expdat,seqs,compareseqs=None,fdrval=0.1):
	"""
	get a list of annotations enriched (or depleted) in seqs compared to a comparison group of sequences

	each annotation term in expdat.annotationseqs is tested with a binomial test: the number of times
	the term appears in the annotations of seqs against its frequency in the comparison group

	input:
	expdat : Experiment
	seqs : list of sequences ('ACGT')
		the sequences in which to test the enrichment
	compareseqs : list of sequences ('ACGT')
		the sequences to compare to
		None (default) to compare to all the experiment
	fdrval : float
		a term is reported only if its hs.fdr() value is <= fdrval

	output:
	newplist : list of dict
		one dict per reported term with the keys 'description','pval','observed','expected',
		sorted by decreasing |observed-expected|/mean(observed,expected)
		an empty list if seqs or the comparison group do not have any annotations
	"""
	# if annotations not initialized - get them (to save time)
	if expdat.seqannotations is None or expdat.annotationseqs is None:
		expdat=hs.getexpannotations(expdat,usesupercooldb=False)

	# count the number of annotations for each term in the group
	# into a dict {term:total number observed in group} (terms never seen count as 0)
	groupannotationcount=collections.Counter(cinfo for cseq in seqs for cinfo in expdat.seqannotations[cseq])
	totgroup=sum(groupannotationcount.values())

	# same count for the comparison group
	if compareseqs is None:
		compareseqs=expdat.seqs
	compgroupannotationcount=collections.Counter(cinfo for cseq in compareseqs for cinfo in expdat.seqannotations[cseq])
	totcompgroup=sum(compgroupannotationcount.values())

	# without annotations on one of the sides there is nothing to test
	# (and the term frequency below would be a division by zero)
	if totgroup==0 or totcompgroup==0:
		return []

	hs.Debug(6,'%d annotations in group, %d in all' % (totgroup,totcompgroup))
	# calculate the probability per term
	# note: we use a bad calculation (can choose the same term twice for a single bacteria). need to improve (permutations?)
	pv=[]
	for cinfo in expdat.annotationseqs:
		observed=groupannotationcount[cinfo]
		pcompgroup=float(compgroupannotationcount[cinfo])/totcompgroup
		# probability of seeing at most / at least the observed count; keep the smaller tail
		pval1=stats.binom.cdf(observed,totgroup,pcompgroup)
		pval2=stats.binom.cdf(totgroup-observed,totgroup,1-pcompgroup)
		pv.append({'pval':np.min([pval1,pval2]),'observed':observed,'expected':pcompgroup*totgroup,'description':cinfo})

	fval=hs.fdr([cpv['pval'] for cpv in pv])
	keep=np.where(np.array(fval)<=fdrval)[0]
	plist=[pv[cidx] for cidx in keep]

	# rank the reported terms by the relative difference between observed and expected
	rat=[]
	for cpv in plist:
		meanval=np.mean([cpv['observed'],cpv['expected']])
		if meanval>0:
			rat.append(np.abs(float(cpv['observed']-cpv['expected']))/meanval)
		else:
			# term absent from both groups - no difference (avoids a 0/0 nan that would sort first)
			rat.append(0.0)
	si=np.argsort(rat)[::-1]
	newplist=[plist[cidx] for cidx in si]

	for cp in newplist:
		if cp['observed']>cp['expected']:
			hs.Debug(6,cp)

	return newplist