示例#1
0
def clusterbacteria(exp,minreads=0,uselog=True):
	"""
	reorder the bacteria of an experiment so bacteria with similar behavior are adjacent
	(single linkage hierarchical clustering on the euclidean distance between the scaled rows)

	input:
	exp : Experiment
	minreads : int
		bacteria are first filtered with filterminreads using this threshold (less bacteria - faster clustering)
	uselog : bool
		True to log2 transform the reads (after setting all values <=2 to 2) before scaling,
		False to scale the reads as they are

	output:
	newexp : Experiment
		the filtered experiment, with bacteria ordered by the clustering leaves order
	"""
	params=locals()

	filtered=hs.filterminreads(exp,minreads,logit=False)
	# the transform/scaling is done on a copy of the data table
	profile=copy.copy(filtered.data)
	if uselog:
		profile[profile<=2]=2
		profile=np.log2(profile)
	# scale each row (bacteria) - presumably sklearn.preprocessing.scale, TODO confirm the import
	profile=scale(profile,axis=1,copy=False)
	# cluster the rows and use the dendrogram leaves order as the new bacteria order
	distances=spatial.distance.pdist(profile,metric='euclidean')
	linkage=cluster.hierarchy.single(distances)
	leaforder=cluster.hierarchy.leaves_list(linkage)

	newexp=hs.reorderbacteria(filtered,leaforder)
	hs.addcommand(newexp,"clusterbacteria",params=params,replaceparams={'exp':exp})
	newexp.filters.append("cluster bacteria minreads=%d" % minreads)
	return newexp
示例#2
0
def filterminreads(exp,minreads,logit=True,useabs=False):
	"""
	keep only bacteria whose total number of reads (summed over all samples) is at least minreads
	input:
	exp : Experiment
	minreads : float
		the minimal total number of reads over all samples for a bacteria to be kept
	logit : bool
		True to add to the filters list and command log, False to not (if called from another heatsequer function)
	useabs : bool
		True to sum the absolute values of the reads, False (default) to sum the reads as they are
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	table=np.abs(exp.data) if useabs else exp.data
	totals=np.sum(table,axis=1)
	keepidx=np.where(totals>=minreads)[0]
	newexp=hs.reorderbacteria(exp,keepidx)
	if logit:
		newexp.filters.append('filter min reads %d' % minreads)
		hs.addcommand(newexp,"filterminreads",params=params,replaceparams={'exp':exp})
	hs.Debug(6,'%d Bacteria left' % len(newexp.sids))
	return newexp
示例#3
0
def sortbacteria(exp,inplace=False,logit=True):
	"""
	order the bacteria of an experiment by sorting their taxonomy strings (exp.tax) with hs.isort

	input:
	exp : experiment
		the experiment to sort
	inplace : bool
		passed on to hs.reorderbacteria: True to reorder the given experiment, False to create a new one
	logit : bool
		True to add to the filters list and command log, False to skip (if called from other heatsequer function)

	output:
		newexp : experiment
			the experiment with bacteria ordered by taxonomy
	"""
	params=locals()

	# only the sort order is needed, the sorted taxonomy values are discarded
	_,taxorder=hs.isort(exp.tax)
	newexp=hs.reorderbacteria(exp,taxorder,inplace=inplace)
	if not logit:
		return newexp
	newexp.filters.append('sorted bacteria by taxonomy')
	hs.addcommand(newexp,"sortbacteria",params=params,replaceparams={'exp':exp})
	return newexp
示例#4
0
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
	"""
	sort the bacteria by the center of mass of their reads along the sample axis
	(center of mass = sum(reads*sample position)/sum(reads), sample positions are 0..numsamples-1)
	input:
	expdat : Experiment
	field : string
		name of the field to sort the samples by before calculating the center of mass, or False to use the current sample order
	numeric : bool
		True if the sort field is numeric (ignored if field is False)
	uselog : bool
		True to log2 transform the data (after setting values <1 to 1) before the center of mass calculation
	output:
	newexp : Experiment
		the experiment with sorted bacteria
		NOTE: it is built from expdat, so the sample sorting by field only affects the calculation, not the returned sample order
	"""
	params=locals()

	if field:
		sortedexp=hs.sortsamples(expdat,field,numeric=numeric)
	else:
		sortedexp=hs.copyexp(expdat)
	table=sortedexp.data
	if uselog:
		table[table<1]=1
		table=np.log2(table)
	positions=np.arange(len(sortedexp.samples))
	# one center of mass per bacteria (row)
	centers=[np.dot(table[rowidx,:],positions)/np.sum(table[rowidx,:]) for rowidx in range(len(sortedexp.seqs))]
	_,order=hs.isort(centers)
	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
	hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#5
0
def sortbyvariance(expdat,field=False,value=False,exact=False,norm=False):
	"""
	sort bacteria by how much they vary between samples
	the variability is calculated on a subset of the samples (field/value/exact) and
	the resulting order is applied to the whole experiment
	input:
	expdat : Experiment
	field : string
		name of the field used to select the samples for the calculation or False to use all samples
	value : string
		value of field selecting the samples to use
	exact : bool
		is the value exact or partial string (passed to hs.filtersamples)
	norm : bool
		False to sort by the standard deviation (np.std) of each bacteria, True to sort by standard deviation/mean
	output:
	newexp : Experiment
		the experiment with bacteria ordered by the variability in the selected samples
	"""
	params=locals()

	subset=hs.filtersamples(expdat,field,value,exact=exact) if field else copy.deepcopy(expdat)

	spread=np.std(subset.data,axis=1)
	if norm:
		spread=spread/np.mean(subset.data,axis=1)
	_,order=hs.isort(spread)

	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by variance field=%s value=%s normalize=%s" % (field,value,norm))
	hs.addcommand(newexp,"sortbyvariance",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#6
0
def filterid(expdat,sids,exclude=False):
	"""
	filter bacteria keeping only ones in sids
	input:
	expdat : Experiment
	sids : list of integers (or a single id)
		the list of (hashed) sequence ids. a tuple/set of ids is treated like a list,
		any other non-list value is treated as a single id
	exclude : bool
		False to keep these bacteria, True to filter away
	output:
	newexp : Experiment
		the filtered experiment (bacteria stay in their original relative order)
	"""
	params=locals()

	# a tuple/set used to be wrapped as one (never matching) id - treat it as a collection of ids
	if isinstance(sids,(tuple,set,frozenset)):
		sids=list(sids)
	elif not isinstance(sids,list):
		sids=[sids]
	hs.Debug(1,'filter ids',sids)
	# single pass over the experiment with O(1) membership tests instead of a nested loop.
	# going over expdat.sids in order also gives a deterministic, duplicate-free position list
	wanted=set(sids)
	if exclude:
		keep=[idx for idx,tid in enumerate(expdat.sids) if tid not in wanted]
	else:
		keep=[idx for idx,tid in enumerate(expdat.sids) if tid in wanted]
	hs.Debug(1,'keep pos',keep)
	newexp=hs.reorderbacteria(expdat,keep)
	if exclude:
		newexp.filters.append('Filter %d ids (exclude)' % len(sids))
	else:
		newexp.filters.append('Filter %d ids' % len(sids))
	hs.addcommand(newexp,"filterid",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#7
0
def filterannotations(expdat,annotation,cdb=None,exclude=False):
	"""
	filter bacteria according to their cooldb annotations, keeping (or removing) the ones
	which have the substring annotation in at least one of their cooldb info entries
	input:
	expdat : Experiment
	annotation : string
		substring to look for in the annotations (case insensitive)
	cdb : the database of cool sequences or None (default) to use hs.cdb
	exclude : bool
		False to keep matching bacteria, True to remove matching bacteria

	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	if cdb is None:
		cdb=hs.cdb
	keeplist=[]
	for idx,cseq in enumerate(expdat.seqs):
		info=hs.cooldb.getseqinfo(cdb,cseq)
		# does any of the info entries of this sequence contain the annotation?
		found=any(annotation.lower() in str(cinfo).lower() for cinfo in info)
		if exclude:
			found=not found
		if found:
			keeplist.append(idx)
	newexp=hs.reorderbacteria(expdat,keeplist)
	newexp.filters.append('Filter annotations %s' % annotation)
	hs.addcommand(newexp,"filterannotations",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d bacteria found' % len(keeplist))
	return newexp
示例#8
0
def filterknownbact(expdat,cdb=None,exclude=False):
	"""
	filter keeping only bacteria which we know about in cooldb
	(a bacteria is known if hs.cooldb.getseqinfo returns at least one entry for its sequence)
	input:
	expdat : Experiment
	cdb : cooldb
		the manual annotation database or None (default) to use hs.cdb
	exclude : bool
		True to throw away known bacteria, False to keep only them
	output:
	newexp : Experiment
		the filtered experiment (bacteria stay in their original relative order)
	"""
	params=locals()

	if cdb is None:
		cdb=hs.cdb
	known=[idx for idx,cseq in enumerate(expdat.seqs) if len(hs.cooldb.getseqinfo(cdb,cseq))>0]
	hs.Debug(2,'Found %d sequences known in cooldb' % len(known))
	if exclude:
		# build an ordered list of the unknown positions. the set difference used before
		# handed an unordered set to reorderbacteria, unlike the other filters which pass a list
		knownset=set(known)
		keep=[idx for idx in range(len(expdat.seqs)) if idx not in knownset]
	else:
		keep=known
	newexp=hs.reorderbacteria(expdat,keep)
	if exclude:
		# the 'keep only sequences from cooldb' entry is not added here since it contradicts exclude
		newexp.filters.append('filter exclude cooldb known bacteria')
	else:
		newexp.filters.append('filter cooldb known bacteria')
		# second entry kept for backward compatibility of the filters list
		newexp.filters.append('keep only sequences from cooldb')
	hs.Debug(6,'%d bacteria left' % len(newexp.sids))
	hs.addcommand(newexp,"filterknownbact",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#9
0
	def bicluster(self):
		"""
		open the bicluster window for the (single) selected experiment and, if the dialog is accepted,
		add a new experiment reordered according to the bicluster result

		the new experiment is named <studyname>_bicluster
		NOTE(review): cexp.bactorder and cexp.samporder are presumably set by BiClusterWindow - confirm
		"""
		items=self.bMainList.selectedItems()
		if len(items)!=1:
			print("Need 1 item")
			return
		cname=str(items[0].text())
		cexp=self.explist[cname]

		biclusterwin = BiClusterWindow(cexp,cdb=self.cooldb,bdb=self.bactdb)
		res=biclusterwin.exec_()
		if res!=QtGui.QDialog.Accepted:
			return
		newexp=hs.reorderbacteria(cexp,cexp.bactorder)
		# samporder is an order of the samples, so it is applied to the sample axis
		# (it was applied with reorderbacteria, i.e. to the bacteria axis a second time)
		newexp=hs.reordersamples(newexp,cexp.samporder)
		newexp.studyname=newexp.studyname+'_bicluster'
		newexp.filters.append("bicluster")
		self.addexp(newexp)
示例#10
0
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
	"""
	keep bacteria with a significant imbalance between the number of positive and negative samples
	and sort them by their mean sign (nans are ignored)
	for each bacteria the p-value is the smaller of the two one sided binomial (p=0.5) cdf values
	for the number of positive / negative samples. the p-values are then passed to hs.fdr
	input:
	expdat : Experiment
	field,value,exclude,exact : name of field and value of field (passed to hs.filtersamples) to test only these samples
		or field=False for all samples (default)
	maxfval : float
		the maximal value (as returned from hs.fdr) for a bacteria to be kept

	output:
	newexp : Experiment
		only the bacteria passing maxfval, sorted by the mean of the sample signs (ascending)
	"""
	params=locals()

	if field:
		texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)
	else:
		texp=hs.copyexp(expdat)

	# from here on only the sign (-1/0/1/nan) of each entry is used
	texp.data=np.sign(texp.data)
	signs=np.nanmean(texp.data,axis=1)
	numpos=np.nansum(texp.data>0,axis=1)
	numneg=np.nansum(texp.data<0,axis=1)
	pval=np.ones(len(numpos))
	for cpos,(cnumpos,cnumneg) in enumerate(zip(numpos,numneg)):
		total=cnumpos+cnumneg
		if total==0:
			# no non-zero samples - leave the p-value at 1
			continue
		pvalpos=stats.binom.cdf(cnumpos,total,0.5)
		pvalneg=stats.binom.cdf(cnumneg,total,0.5)
		pval[cpos]=np.min([pvalpos,pvalneg])

	fval=hs.fdr(pval)
	keep=np.where(np.array(fval)<=maxfval)[0]
	newexp=hs.reorderbacteria(expdat,keep)
	# sort the bacteria that passed by their mean sign
	order=np.argsort(signs[keep])

	newexp=hs.reorderbacteria(newexp,order)
	newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
	hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#11
0
def filterfieldwave(expdat,field,val1,val2=False,mineffect=1,method='mean',uselog=True):
	"""
	find all sequences which show an effect size of at least mineffect between val1 and val2 samples in field
	no statistical significance testing is performed
	the input experiment data is not modified (all transforms are done on a copy)

	input:
	expdat : Experiment
	field : string
		name of field to use for group separation
	val1 : string
		value in field for group1
	val2 : string
		value in field for group2 or False for all the other samples except val1
	mineffect : float
		min difference (absolute value) between the group means per OTU in order to keep
	method: string
		'mean' (default) - compare the group means of the (optionally log transformed) values
		'ranksum' - replace the values of each sequence by their ranks over the samples before comparing the group means
	uselog : bool
		True to log2 transform the data (values <1 are treated as 1)

	output:
	newexp : Experiment
		only with sequences showing a mineffect difference, sorted by the difference (group1-group2, ascending)
	"""
	params=locals()

	numseqs=len(expdat.seqs)
	numsamples=len(expdat.samples)
	# float copy of the table: the clamping/ranking below must not leak into expdat.data
	# (and ranks of ties are fractional so they need a float table)
	dat=np.array(expdat.data,dtype=float)
	if uselog:
		dat=np.log2(np.maximum(dat,1))
	if method=='ranksum':
		for idx in range(numseqs):
			dat[idx,:]=stats.rankdata(dat[idx,:])

	pos1=hs.findsamples(expdat,field,val1)
	if val2:
		pos2=hs.findsamples(expdat,field,val2)
	else:
		pos2=np.setdiff1d(np.arange(numsamples),pos1,assume_unique=True)

	outpos=[]
	odif=[]
	for idx in range(numseqs):
		cdif=np.mean(dat[idx,pos1])-np.mean(dat[idx,pos2])
		if abs(cdif)>=mineffect:
			outpos.append(idx)
			odif.append(cdif)

	# order the kept sequences by their effect
	si=np.argsort(odif)
	outpos=hs.reorder(outpos,si)
	newexp=hs.reorderbacteria(expdat,outpos)
	newexp.filters.append('filterfieldwave field %s val1 %s val2 %s' % (field,val1,val2))
	hs.addcommand(newexp,"filterfieldwave",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#12
0
	def view(self):
		"""
		plot the experiment (self.cexp) with the samples in self.samples and the bacteria in self.seqs
		moved to the beginning, followed by all the other samples/bacteria

		NOTE(review): self.samples presumably holds sample positions and self.seqs sequences (keys of cexp.seqdict) - confirm
		"""
		cexp=self.cexp

		# selected samples first, then the rest
		othersamples=np.setdiff1d(np.arange(len(cexp.samples)),self.samples)
		sampo=np.concatenate((self.samples,othersamples))
		# selected bacteria (converted to positions) first, then the rest
		selbact=[cexp.seqdict[cseq] for cseq in self.seqs]
		otherbact=np.setdiff1d(np.arange(len(cexp.seqs)),selbact)
		bacto=np.concatenate((selbact,otherbact))

		newexp=hs.reorderbacteria(cexp,bacto)
		newexp=hs.reordersamples(newexp,sampo,inplace=True)
		hs.plotexp(newexp,seqdb=self.bactdb,sortby=False,numeric=False,usegui=True,cdb=self.cooldb,showline=False)
示例#13
0
def filterseqs(expdat,seqs,exclude=False,subseq=False,logit=True):
	"""
	filter the experiment keeping (or removing) the sequences appearing in the list seqs
	input:
	expdat : Experiment
	seqs : list of strings
		the (ACGT) sequences to keep
	exclude : bool
		True to filter away instead of keep, False to keep
	subseq : bool
		True to also match when one sequence is a subsequence of the other (slower) - only the
		first matching experiment sequence is used for each sequence in seqs.
		False - look only for exact match (via expdat.seqdict).
	logit : bool
		True to add to the filters list and command log, False to not log it (if called from other heatsequer function)

	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	keeplist=[]
	for cseq in seqs:
		if not subseq:
			# exact match - direct lookup
			if cseq in expdat.seqdict:
				keeplist.append(expdat.seqdict[cseq])
			else:
				hs.Debug(7,'sequence not in experiment',cseq)
			continue
		for idx,coseq in enumerate(expdat.seqs):
			# test if the shorter of the two is contained in the longer one
			if len(cseq)>len(coseq):
				matched=coseq in cseq
			else:
				matched=cseq in coseq
			if matched:
				keeplist.append(idx)
				break
	if exclude:
		keeplist=list(set(range(len(expdat.seqs))).difference(keeplist))
	newexp=hs.reorderbacteria(expdat,keeplist)
	if logit:
		newexp.filters.append('filter sequences')
		hs.addcommand(newexp,"filterseqs",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#14
0
def sortbyfreq(expdat,field=False,value=False,exact=False,exclude=False,logscale=True,useabs=False):
	"""
	sort the bacteria of an experiment by their mean frequency
	the mean is calculated on a subset of the samples (field/value/exact/exclude) and
	the resulting order is applied to the whole experiment
	input:
	expdat : Experiment
	field : string
		name of the field used to select the samples for the mean or False for all samples
	value : string
		value of field selecting the samples
	exact : bool
		is the value exact or partial string
	exclude : bool
		True to use all samples except the field/value ones, False to use only the field/value samples (default=False)
	logscale : bool
		True (default) to log2 transform the frequencies (values <2 are set to 2) before the mean, False to use original values
	useabs : bool
		True to use the mean of the absolute values, False (default) to use the mean of the values

	output:
	newexp : Experiment
		the experiment with bacteria ordered by the mean frequency in the selected samples
	"""
	params=locals()

	if field:
		subset=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)
	else:
		subset=copy.deepcopy(expdat)
	if logscale:
		subset.data[subset.data<2]=2
		subset.data=np.log2(subset.data)
	freqs=np.abs(subset.data) if useabs else subset.data
	_,order=hs.isort(np.mean(freqs,axis=1))

	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by freq field=%s value=%s" % (field,value))
	hs.addcommand(newexp,"sortbyfreq",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#15
0
def filternans(expdat,minpresence):
	"""
	filter an experiment containing nans in the table, keeping only bacteria with at least minpresence finite values
	(np.isfinite is used, so infinite values are not counted either)
	input:
	expdat : Experiment
	minpresence: int
		minimal number of finite (non-nan) samples (keep only if >=)
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	numok=np.sum(np.isfinite(expdat.data),axis=1)
	keep=np.where(numok>=minpresence)[0]
	# report through the heatsequer debug mechanism (like the other filters) instead of a bare print
	hs.Debug(6,'%d Bacteria left' % len(keep))
	newexp=hs.reorderbacteria(expdat,keep)
	newexp.filters.append('filternans keep only with >=%d non nan reads' % minpresence)
	hs.addcommand(newexp,"filternans",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#16
0
def sortcorrelation(expdat,method='all'):
	"""
	EXPERIMENTAL
	sort bacteria according to highest correlation/anti-correlation
	starts from one bacteria of the most (abs) correlated pair, and each time appends the not yet
	used bacteria with the highest abs(correlation) to the last bacteria added

	input:
	expdat : Experiment
	method:
		currently not used by the code (correlation is always calculated on all samples)
		pres - use correlation only on samples where present in one of the two sequences
		all - use correlation on all samples (default)

	output:
	newexp - the experiment with bacteria sorted by correlation (each time next bacteria the most abs(corr) to the current bacteria)
	"""
	params=locals()

	# log transform (values <=2 set to 2) and scale a copy of the data
	cdat=copy.copy(expdat.data)
	cdat[cdat<=2]=2
	cdat=np.log2(cdat)
	cdat=scale(cdat,axis=1,copy=False,with_mean=False)
	hs.Debug(6,"Calculating correlation matrix")
	cmat=np.corrcoef(cdat)
	hs.Debug(6,"sorting bacteria")
	cmat=np.abs(cmat)
	# remove the self correlation so a bacteria is never its own best match
	cmat-=np.identity(len(expdat.seqs))
	current=np.unravel_index(np.argmax(cmat),np.shape(cmat))[0]
	order=[current]
	ubact=np.delete(np.arange(len(expdat.seqs)),current)
	while len(ubact)>0:
		# correlation of all unused bacteria to the current bacteria
		ccorr=cmat[ubact,current].flatten()
		best=np.argmax(ccorr)
		# best is a position inside ubact - convert it to the bacteria index before using it
		# as the next matrix column (the position itself was used as the column before)
		current=ubact[best]
		order.append(current)
		ubact=np.delete(ubact,best)
	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("correlation sort")
	hs.addcommand(newexp,"sortcorrelation",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#17
0
def filterminsamples(expdat,minsamples):
	"""
	filter away bacteria present (reads>0) in less than minsamples samples
	input:
	expdat : Experiment
	minsamples : int
		the minimal number of samples where the bacteria appears

	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	numsamples=np.sum(expdat.data>0,axis=1)
	keep=np.where(numsamples>=minsamples)[0]
	newexp=hs.reorderbacteria(expdat,keep)
	newexp.filters.append('filter min samples %d' % minsamples)
	# log under this function's own name (it was logged as "filterpresence")
	hs.addcommand(newexp,"filterminsamples",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d Bacteria left' % len(newexp.sids))
	return newexp
示例#18
0
def filterpresence(expdat,frac):
	"""
	keep only bacteria which are present (reads>0) in at least frac of the samples
	input:
	expdat : Experiment
	frac : float
		the minimal fraction of samples a bacteria needs to appear in to be kept

	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	numpresent=np.sum(expdat.data>0,axis=1)
	fracpresent=numpresent/float(len(expdat.samples))
	keepidx=np.where(fracpresent>=frac)[0]
	newexp=hs.reorderbacteria(expdat,keepidx)
	newexp.filters.append('filter presence %f' % frac)
	hs.addcommand(newexp,"filterpresence",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d Bacteria left' % len(newexp.sids))
	return newexp
示例#19
0
def filtermean(expdat,meanval):
	"""
	keep only bacteria with a mean number of reads per sample of at least
	meanval * (mean total reads per sample)
	input:
	expdat : Experiment
		the experiment
	meanval : float
		the minimal mean fraction of reads per sample (a fraction, NOT out of 10k/sample) for a bacteria to be kept
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	# convert the fraction to a reads threshold using the mean of the per sample read totals
	threshold=meanval*np.mean(np.sum(expdat.data,0))
	bactmeans=np.mean(expdat.data,axis=1)
	keepidx=np.where(bactmeans>=threshold)[0]
	newexp=hs.reorderbacteria(expdat,keepidx)
	newexp.filters.append('filter mean reads %f' % meanval)
	hs.addcommand(newexp,"filtermean",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d Bacteria left' % len(newexp.sids))
	return newexp
示例#20
0
def filtertaxonomy(exp,tax,exact=False,exclude=False):
	"""
	filter bacteria according to their taxonomy string (exp.tax)
	input:
	exp : Experiment
	tax : string
		the taxonomy name to look for
	exact : bool
		True to match only taxonomies equal to tax, False to match taxonomies containing tax (case sensitive)
	exclude : bool
		True to throw away matching bacteria, False to keep only matching bacteria
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	match=[]
	for cidx,ctax in enumerate(exp.tax):
		hit=(ctax==tax) if exact else (tax in ctax)
		if exclude:
			hit=not hit
		if hit:
			match.append(cidx)
	newexp=hs.reorderbacteria(exp,match)
	# description of the filter for the filters list
	filt='filter taxonomy '+('exact match ' if exact else '')+('exclude ' if exclude else '')+tax
	newexp.filters.append(filt)
	hs.Debug(6,'%d bacteria left' % len(newexp.sids))
	hs.addcommand(newexp,"filtertaxonomy",params=params,replaceparams={'exp':exp})
	return newexp
示例#21
0
def sortbygroupdiff(expdat,field,val1,val2):
	"""
	sort the bacteria by the normalized difference between the 2 sample groups (val1,val2 in field)
	for each bacteria m1,m2 are the group means of log2(reads+2) and the sort key is (m1-m2)/(m1+m2+20)
	input:
	expdat : Experiment
	field : string
		the name of the field defining the 2 groups
	val1,val2 : string
		the values of field for the 2 groups (exact match)

	output:
	newexp : Experiment
		the experiment with sorted bacteria
	"""
	params=locals()

	# mean log2(reads+2) of each bacteria in each of the two groups
	groupmeans=[]
	for cval in (val1,val2):
		groupexp=hs.filtersamples(expdat,field,cval,exact=True)
		groupmeans.append(np.mean(np.log2(groupexp.data+2),axis=1))
	m1,m2=groupmeans
	_,order=hs.isort((m1-m2)/(m1+m2+20))
	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by group difference field=%s val1=%s val2=%s" % (field,val1,val2))
	hs.addcommand(newexp,"sortbygroupdiff",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#22
0
def subsample(expdat,numreads=10000,inplace=False):
	"""
	subsample (rarify) reads from all samples in an experiment
	the experiment is first passed through hs.filterorigreads and hs.toorigreads, then
	subsampled using the biom Table.subsample method, and finally renormalized with normalizereads
	input:
	expdat : Experiment
	numreads : int
		number of reads to subsample to
	inplace : bool
		true to replace current experiment (passed to hs.filterorigreads)

	output:
	newexp - the new subsampled experiment (origreads of all samples are set to numreads)
	"""
	# biom is imported here so it is only needed when subsampling
	import biom

	# NOTE: locals() is taken after the import, so params also contains the biom module
	params=locals()

	newexp=hs.filterorigreads(expdat,numreads,inplace)
	newexp=hs.toorigreads(newexp,inplace=True)

	# build a biom table (observations=sequences, samples=samples) and subsample it
	table=biom.table.Table(newexp.data,newexp.seqs,newexp.samples)
	# NOTE(review): axis='observation' is used here - confirm against the installed biom version
	# that this rarefies each sample (and not each observation) to numreads
	table=table.subsample(numreads,axis='observation')
	# sanity check - the samples in the subsampled table should match the experiment samples
	tids=table.ids(axis='sample')
	for idx,cid in enumerate(tids):
		if not cid==newexp.samples[idx]:
			print('problem with sample ids!!!!')
	# keep only the sequences still present in the subsampled table (in the table order)
	newpos=[]
	for cseq in table.ids(axis='observation'):
		newpos.append(newexp.seqdict[cseq])
	newexp=hs.reorderbacteria(newexp,newpos,inplace=True)
	# replace the data with the subsampled counts (as a dense array)
	newexp.data=table.matrix_data.todense().A
	# normalizereads/updateorigreads are called without the hs prefix - presumably defined in this module
	newexp=normalizereads(newexp,numreads=10000,inplace=True,fixorig=False)
	for cidx in range(len(newexp.samples)):
		newexp.origreads[cidx]=numreads
	newexp=updateorigreads(newexp)
	newexp.filters.append("subsample to %d" % numreads)
	hs.addcommand(newexp,"subsample",params=params,replaceparams={'expdat':expdat})
	return newexp
示例#23
0
def filtern(expdat):
	"""
	remove all sequences containing "N" (upper or lower case) from the experiment and renormalize
	the remaining reads using hs.normalizereads
	input:
	expdat : Experiment
	output:
	newexp : Experiment
		experiment without sequences containing "N"
	"""
	params=locals()

	keeplist=[idx for idx,cseq in enumerate(expdat.seqs) if not ("N" in cseq or "n" in cseq)]
	newexp=hs.normalizereads(hs.reorderbacteria(expdat,keeplist))
	newexp.filters.append('Filter sequences containing N')
	hs.addcommand(newexp,"filtern",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d sequences before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
	return newexp
示例#24
0
def filterwave(expdat,field=False,numeric=True,minfold=2,minlen=3,step=1,direction='up',posloc='start'):
	"""
	filter bacteria, keeping only ones that show a consecutive region of samples with higher/lower mean than other samples
	Done by scanning all windowlen/startpos options for each bacteria
	and comparing the mean log2 reads inside the window to the mean log2 reads of all samples outside the window
	(reads <1 are treated as 1 for the log. the data of the returned experiment is not modified)
	input:
	expdat : Experiment
	field : string
		The field to sort by or False to skip sorting
	numeric : bool
		For the sorting according to field (does not matter if field is False)
	minfold : float
		The minimal difference (in log2 units) between the window and the rest in order to keep
	minlen : int
		The minimal window len for over/under expression testing
	step : int
		The skip between tested window lengths (to make it faster use a larger skip)
	direction : string
		'both' - test both over and under expression in the window
		'up' - only overexpressed
		'down' - only underexpressed
	posloc : string
		The position used for sorting the bacteria:
		'start' (or 'maxstart') - the beginning of the maximal wave
		'mid' (or 'maxmid') - the middle of the maximal wave
		'gstart' - the beginning of the first window with >=minfold change

	output:
	newexp : Experiment
		The filtered experiment, sorted according to window start samples position
		(False if posloc is not supported)
	"""
	params=locals()

	# accept both the documented names and the short names used by the code
	posmodes={'start':'start','maxstart':'start','mid':'mid','maxmid':'mid','gstart':'gstart'}
	if posloc not in posmodes:
		hs.Debug(7,'posloc not supported %s' % posloc)
		return False
	posmode=posmodes[posloc]

	# sort if needed
	if field:
		newexp=hs.sortsamples(expdat,field,numeric=numeric)
	else:
		newexp=hs.copyexp(expdat)

	# log transform without touching newexp.data (it is returned to the caller)
	dat=np.log2(np.maximum(newexp.data,1))
	numsamples=len(newexp.samples)
	numbact=len(newexp.seqs)
	maxdiff=np.zeros([numbact])
	maxpos=np.zeros([numbact])-1
	maxlen=np.zeros([numbact])
	allpos=np.arange(numsamples)
	for startpos in range(numsamples-minlen):
		for cwin in np.arange(minlen,numsamples-startpos,step):
			endpos=startpos+cwin
			meanin=np.mean(dat[:,startpos:endpos],axis=1)
			# all samples before and after the window (never empty - the window never contains the last sample)
			outside=np.hstack([allpos[:startpos],allpos[endpos:]])
			meanout=np.mean(dat[:,outside],axis=1)
			cdiff=meanin-meanout
			if direction=='both':
				cdiff=np.abs(cdiff)
			elif direction=='down':
				cdiff=-cdiff
			# bacteria for which this window is the best so far
			improved=cdiff>maxdiff
			if posmode=='gstart':
				usepos=np.logical_and(cdiff>=minfold,maxpos==-1)
				maxpos[usepos]=startpos
			elif posmode=='start':
				maxpos[improved]=startpos
			else:
				maxpos[improved]=startpos+int(cwin/2)
			maxlen[improved]=cwin
			maxdiff=np.maximum(maxdiff,cdiff)

	keep=np.where(maxdiff>=minfold)[0]
	keeppos=maxpos[keep]
	si=np.argsort(keeppos)
	keep=keep[si]
	for ci in keep:
		hs.Debug(6,'bacteria %s startpos %d len %d diff %f' % (newexp.tax[ci],maxpos[ci],maxlen[ci],maxdiff[ci]))
	newexp=hs.reorderbacteria(newexp,keep)
	newexp.filters.append('Filter wave field=%s minlen=%d' % (field,minlen))
	hs.addcommand(newexp,"filterwave",params=params,replaceparams={'expdat':expdat})
	# the filter removes bacteria, so report the number of bacteria (not samples)
	hs.Debug(6,'%d bacteria before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
	return newexp