def findmislabels(expdat,field,distmetric='bc'):
    """
    plot a heatmap of the mean distance between every sample and each group of
    samples sharing a value of field (to help spot mislabelled samples)

    input:
    expdat : Experiment
    field : string
        name of the field to examine (i.e. subjectid)
    distmetric : string
        the distance metric to use (passed to hs.calcdist)

    output:
    None. A new figure is opened showing a matrix with one row per sample
    (labelled 'samplename;fieldvalue') and one column per unique value of field
    """
    expdat = hs.sortsamples(expdat, field)
    fieldvals = hs.getfieldvals(expdat, field)
    groups = list(set(fieldvals))
    rowlabels = [sampname + ';' + fieldvals[pos] for pos, sampname in enumerate(expdat.samples)]

    meandist = np.zeros([len(fieldvals), len(groups)])
    for col, groupval in enumerate(groups):
        groupexp = hs.filtersamples(expdat, field, groupval, exact=True)
        for row, sampname in enumerate(expdat.samples):
            # distances to all group members, never comparing a sample to itself
            dists = [
                hs.calcdist(groupexp.data[:, memberpos], expdat.data[:, row], distmetric=distmetric)
                for memberpos, membername in enumerate(groupexp.samples)
                if membername != sampname
            ]
            meandist[row, col] = np.mean(dists)

    figure()
    img = imshow(meandist, interpolation='nearest', aspect='auto')
    ax = img.get_axes()
    # columns are the field values, rows are the individual samples
    ax.set_xticks(range(len(groups)))
    ax.set_xticklabels(groups, rotation=90)
    ax.set_yticks(range(len(rowlabels)))
    ax.set_yticklabels(rowlabels)
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
    """
    sort bacteria in the experiment according to a 1d gradient by calculating
    the center of mass of each bacteria along the sample axis

    input:
    expdat
    field : string
        the name of the field to sort the samples by before calculating the
        center of mass, or False to use the current sample order
    numeric : bool
        True if the sort field is numeric (ignored if no sort field)
    uselog : bool
        True to log2 transform the data (after setting values <1 to 1)
        before the center of mass calculation

    output:
    newexp - the experiment with sorted bacteria
        (built by reordering the bacteria of the original expdat, so the sample
        sorting is only used for the center of mass calculation)
    """
    # must stay first so only the call arguments are recorded
    params=locals()

    if not field:
        workexp = hs.copyexp(expdat)
    else:
        workexp = hs.sortsamples(expdat, field, numeric=numeric)

    abund = workexp.data
    if uselog:
        abund[abund < 1] = 1
        abund = np.log2(abund)

    # weight each sample by its position along the (sorted) sample axis
    positions = np.arange(len(workexp.samples))
    centers = [
        np.dot(abund[row, :], positions) / np.sum(abund[row, :])
        for row in range(len(workexp.seqs))
    ]

    sortedvals, order = hs.isort(centers)
    newexp = hs.reorderbacteria(expdat, order)
    newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
    hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
    return newexp
def sortsamples(self):
    """
    sort the samples of the experiment selected in the main list

    Opens a SortSamplesWindow dialog for the selected experiment. If the dialog
    is accepted, the experiment is sorted by the chosen field and either added
    under the new name or used to replace the existing experiment (when the
    overwrite checkbox is checked).
    Exactly one experiment must be selected, otherwise a message is printed and
    nothing is done.
    """
    selected = self.bMainList.selectedItems()
    if len(selected) != 1:
        print("Need 1 item")
        return

    for item in selected:
        expname = str(item.text())
        srcexp = self.explist[expname]
        dialog = SortSamplesWindow(srcexp)
        if dialog.exec_() != QtGui.QDialog.Accepted:
            # dialog was cancelled - nothing to do for this item
            continue

        field = str(dialog.cField.currentText())
        newname = str(dialog.tNewName.text())
        # a check state of 0 means the checkbox is not checked
        numeric = dialog.cNumeric.checkState() != 0
        overwrite = dialog.cOverwrite.checkState()

        newexp = hs.sortsamples(srcexp, field=field, numeric=numeric)
        if overwrite != 0:
            self.replaceexp(newexp)
        else:
            newexp.studyname = newname
            self.addexp(newexp)
def filterwave(expdat,field=False,numeric=True,minfold=2,minlen=3,step=1,direction='up',posloc='start'):
    """
    filter bacteria, keeping only ones that show a consecutive region (window) of
    samples with higher/lower mean than the rest of the samples
    Done by scanning window start positions and window lengths for each bacteria.
    The comparison is done on log2 transformed data (values <1 are treated as 1),
    so the difference between window and non-window means is in log2 units.

    input:
    expdat : Experiment
    field : string
        The field to sort the samples by or False to skip sorting
    numeric : bool
        For the sorting according to field (does not matter if field is False)
    minfold : float
        The minimal difference (mean log2 inside the window minus mean log2 of
        all samples outside the window) in order to keep the bacteria
    minlen : int
        The minimal window len for over/under expression testing
    step : int
        The skip between tested window lengths (to make it faster use a larger skip)
    direction : string
        'both' - test both over and under expression in the window
        'up' - only overexpressed
        'down' - only underexpressed
        (any other value is treated as 'up')
    posloc : string
        The position used for ordering the kept bacteria:
        'start' (or 'maxstart') - beginning of the window with the maximal difference
        'mid' (or 'maxmid') - middle of the window with the maximal difference
        'gstart' - beginning of the first window with >=minfold difference

    output:
    newexp : Experiment
        The filtered experiment, with bacteria sorted according to the window
        position (see posloc), or False if posloc is not supported
    """
    # must stay first so only the call arguments are recorded
    params=locals()

    # validate posloc before doing any work (both documented spellings are accepted)
    if posloc in ('start','maxstart'):
        posmode='start'
    elif posloc in ('mid','maxmid'):
        posmode='mid'
    elif posloc=='gstart':
        posmode='gstart'
    else:
        # NOTE(review): level 9 is assumed to be the error level of hs.Debug - confirm
        hs.Debug(9,'posloc not supported %s' % posloc)
        return False

    # sort if needed
    if field:
        newexp=hs.sortsamples(expdat,field,numeric=numeric)
    else:
        newexp=hs.copyexp(expdat)

    # log transform a copy, so the data of the returned experiment is not modified
    dat=np.log2(np.maximum(newexp.data,1))
    numsamples=len(newexp.samples)
    numbact=len(newexp.seqs)
    # per bacteria sum over all samples, used to get the outside-window mean
    rowtotal=np.sum(dat,axis=1)

    maxdiff=np.zeros([numbact])
    maxpos=np.zeros([numbact])-1
    maxlen=np.zeros([numbact])
    # NOTE: with these ranges startpos+cwin<=numsamples-1, so the last sample is
    # always outside the window and the outside group is never empty
    for startpos in range(numsamples-minlen):
        for cwin in np.arange(minlen,numsamples-startpos,step):
            insum=np.sum(dat[:,startpos:startpos+cwin],axis=1)
            meanin=insum/cwin
            # mean of all samples before and after the window
            meanout=(rowtotal-insum)/(numsamples-cwin)
            cdiff=meanin-meanout
            if direction=='both':
                cdiff=np.abs(cdiff)
            elif direction=='down':
                cdiff=-cdiff

            improved=cdiff>maxdiff
            if posmode=='gstart':
                # only the first window passing the threshold sets the position
                usepos=np.logical_and(cdiff>=minfold,maxpos==-1)
                maxpos[usepos]=startpos
            elif posmode=='start':
                maxpos[improved]=startpos
            else:
                maxpos[improved]=startpos+cwin//2
            maxlen[improved]=cwin
            maxdiff=np.maximum(maxdiff,cdiff)

    # keep bacteria passing the threshold, ordered by their window position
    keep=np.where(maxdiff>=minfold)[0]
    keep=keep[np.argsort(maxpos[keep])]
    for ci in keep:
        hs.Debug(6,'bacteria %s startpos %d len %d diff %f' % (newexp.tax[ci],maxpos[ci],maxlen[ci],maxdiff[ci]))
    newexp=hs.reorderbacteria(newexp,keep)
    newexp.filters.append('Filter wave field=%s minlen=%d' % (field,minlen))
    hs.addcommand(newexp,"filterwave",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d bacteria before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
    return newexp