def filterminreads(exp,minreads,logit=True,useabs=False):
    """
    drop every bacteria whose reads, summed over all samples, are below minreads

    input:
    exp : Experiment
    minreads : float
        the minimal total number of reads (sum over all samples) for a bacteria to be kept
    logit : bool
        True to add to the filters list and command log, False to skip
        (e.g. if called from another heatsequer function)
    useabs : bool
        True to sum the absolute values of the reads, False (default) to sum the raw values

    output:
    newexp : Experiment
        the experiment with only the bacteria passing the threshold
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    reads = np.abs(exp.data) if useabs else exp.data
    totals = np.sum(reads, axis=1)
    keeprows = np.where(totals >= minreads)[0]
    newexp = hs.reorderbacteria(exp, keeprows)
    if logit:
        newexp.filters.append('filter min reads %d' % minreads)
        hs.addcommand(newexp, "filterminreads", params=params, replaceparams={'exp': exp})
    hs.Debug(6, '%d Bacteria left' % len(newexp.sids))
    return newexp
def clustersamples(exp,minreads=0):
    """
    reorder the samples of an experiment so that similar samples are adjacent
    (single linkage hierarchical clustering on bray-curtis distances)

    input:
    exp : Experiment
    minreads : int
        passed to hs.filterorigreads (minimal original number of reads for a sample to be kept)

    output:
    newexp : Experiment
        the filtered experiment with the samples in clustering (leaf) order
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.filterorigreads(exp, minreads)

    # one row per sample, on a copy so the experiment data is not modified
    sampdat = np.transpose(copy.copy(newexp.data))
    # floor the reads at 2 and log transform
    sampdat[sampdat <= 2] = 2
    sampdat = np.log2(sampdat)

    # cluster
    distances = spatial.distance.pdist(sampdat, metric='braycurtis')
    linkage = cluster.hierarchy.single(distances)
    newexp = hs.reordersamples(newexp, cluster.hierarchy.leaves_list(linkage))

    hs.addcommand(newexp, "clustersamples", params=params, replaceparams={'exp': exp})
    newexp.filters.append("cluster samples minreads=%d" % minreads)
    return newexp
def normalizeprctile(expdat,percent=80):
    """
    normalize reads per sample so that the given percentile (rather than the mean)
    is the same in all samples
    used to reduce the effect of outliers (compositionality correction)
    note the percentile is calculated on the same set of bacteria for all samples

    input:
    expdat : Experiment
    percent : float
        the percentile to normalize (0-100)

    output:
    newexp : Experiment
        the new normalized experiment (contains all the bacteria of expdat)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    # select the bacteria to use - don't want to include very low freq. bacteria
    refexp = hs.filterminreads(expdat, 1*len(expdat.samples))

    # per sample percentile, relative to the mean percentile over all samples
    percvals = np.percentile(refexp.data, percent, axis=0)
    percvals = percvals/np.mean(percvals)

    newexp = hs.copyexp(expdat)
    for idx in range(len(expdat.samples)):
        factor = percvals[idx]
        # a zero (or nan/inf) factor cannot be used for scaling - leave such samples as they are
        if factor == 0 or not np.isfinite(factor):
            continue
        # divide (not multiply): a sample with a high percentile must be scaled down
        # so that all samples end up with the same percentile value
        newexp.data[:, idx] = newexp.data[:, idx]/factor

    newexp.filters.append("normalize percentile %f" % percent)
    hs.addcommand(newexp, "normalizeprctile", params=params, replaceparams={'expdat': expdat})
    return newexp
def toorigreads(expdat,inplace=False):
    """
    rescale each sample so its reads sum to the value stored for it in origreads

    NOTE(review): another toorigreads is defined further down in this file; being the
    later definition, it replaces this one when the module is loaded.

    input:
    expdat : Experiment
    inplace : bool
        True to modify expdat, False to work on a copy

    output:
    newexp : Experiment
        each sample with a non-zero sum is scaled to sum to its origreads value
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = expdat if inplace else hs.copyexp(expdat)

    for idx, _ in enumerate(newexp.samples):
        samplesum = np.sum(newexp.data[:, idx])
        target = newexp.origreads[idx]
        # samples without reads cannot be rescaled
        if samplesum == 0:
            continue
        newexp.data[:, idx] = newexp.data[:, idx]*(float(target)/samplesum)

    newexp.filters.append("changed reads to origread value")
    hs.addcommand(newexp, "toorigreads", params=params, replaceparams={'expdat': expdat})
    return newexp
def clusterbacteria(exp,minreads=0,uselog=True):
    """
    reorder the bacteria of an experiment so that bacteria behaving similarly are adjacent
    (single linkage hierarchical clustering on euclidean distances)

    input:
    exp : Experiment
    minreads : int
        the minimal number of reads to keep a bacteria before clustering (to make faster)
    uselog : bool
        True to floor the reads at 2 and log2 transform them before scaling, False to use the reads as is

    output:
    newexp : Experiment
        the filtered experiment with bacteria in clustering (leaf) order
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.filterminreads(exp, minreads, logit=False)

    # work on a copy so the experiment data is not modified
    bactdat = copy.copy(newexp.data)
    if uselog:
        bactdat[bactdat <= 2] = 2
        bactdat = np.log2(bactdat)
    # per row (bacteria) scaling - presumably sklearn.preprocessing.scale, verify against the file imports
    bactdat = scale(bactdat, axis=1, copy=False)

    # cluster
    distances = spatial.distance.pdist(bactdat, metric='euclidean')
    linkage = cluster.hierarchy.single(distances)
    newexp = hs.reorderbacteria(newexp, cluster.hierarchy.leaves_list(linkage))

    hs.addcommand(newexp, "clusterbacteria", params=params, replaceparams={'exp': exp})
    newexp.filters.append("cluster bacteria minreads=%d" % minreads)
    return newexp
def clipseqs(expdat,startpos,addseq='TAC'):
    """
    clip the first nucleotides in all sequences in the experiment to fix an offset in sequencing
    sequences that become identical after clipping are joined (their reads are summed)

    input:
    expdat : Experiment
    startpos : int
        the position to start from (0 indexed), or negative to add nucleotides at the beginning
    addseq : str
        the sequence to add (just a guess) if startpos is negative
        (the first abs(startpos) characters of it are used)

    output:
    newexp : Experiment
        new experiment with all sequences clipped and identical sequences joined
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = copy.deepcopy(expdat)
    newseqs = []
    keeppos = []
    # clipped sequence -> row (in the unclipped data) where it was first seen
    firstrow = {}
    for idx, cseq in enumerate(newexp.seqs):
        if startpos >= 0:
            cseq = cseq[startpos:]
        else:
            cseq = addseq[:abs(startpos)]+cseq
            # keep the sequence length the same as before adding the nucleotides
            cseq = cseq[:len(expdat.seqs[0])]
        if cseq in firstrow:
            # same sequence after clipping - add the reads to the first occurrence
            newexp.data[firstrow[cseq], :] += newexp.data[idx, :]
        else:
            firstrow[cseq] = idx
            newseqs.append(cseq)
            keeppos.append(idx)

    newexp = reorderbacteria(newexp, keeppos)
    newexp.seqs = newseqs
    # rows were renumbered by reorderbacteria, so the sequence lookup must use the
    # positions in the new (joined) experiment and not the rows of the original one
    newexp.seqdict = {cseq: pos for pos, cseq in enumerate(newseqs)}

    hs.addcommand(newexp, "clipseqs", params=params, replaceparams={'expdat': expdat})
    newexp.filters.append("trim %d nucleotides" % startpos)
    return newexp
def convertdatefield(expdat,field,newfield,timeformat='%m/%d/%y %H:%M'):
    """
    add a numeric field (seconds since epoch) calculated from a field containing a date/time string

    input:
    expdat : Experiment
        the experiment to add the field to
    field : string
        name of the field containing the date/time string
    newfield : string
        name of the new field (with seconds since epoch)
    timeformat : string
        format of the date/time field (time.strptime format)

    output:
    newexp : Experiment
        a copy of the experiment with the added seconds since epoch field
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.copyexp(expdat)
    newexp.fields.append(newfield)
    for csamp in newexp.samples:
        parsed = time.strptime(newexp.smap[csamp][field], timeformat)
        # mktime interprets the parsed time as local time
        newexp.smap[csamp][newfield] = time.mktime(parsed)

    newexp.filters.append('add time field %s (based on field %s)' % (newfield, field))
    hs.addcommand(newexp, "convertdatefield", params=params, replaceparams={'expdat': expdat})
    return newexp
def normalizereads(expdat,numreads=10000,fixorig=False,inplace=False):
    """
    rescale every sample so its reads sum to numreads (default 10k)

    input:
    expdat : Experiment
    numreads : int
        the number of reads to normalize each sample to
    fixorig : bool
        True to also divide origreads of each sample by the scaling ratio, False to keep it as before
    inplace : bool
        True to modify expdat, False to create a new experiment

    output:
    newexp : Experiment
        the normalized experiment (samples with 0 reads are left untouched)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = expdat if inplace else hs.copyexp(expdat)

    for idx, _ in enumerate(newexp.samples):
        samplesum = np.sum(newexp.data[:, idx])
        # cannot rescale a sample without reads
        if samplesum == 0:
            continue
        ratio = float(numreads)/samplesum
        newexp.data[:, idx] = newexp.data[:, idx]*ratio
        if fixorig:
            hs.Debug(2, 'fixing original frequencies')
            newexp.origreads[idx] = float(newexp.origreads[idx])/ratio

    newexp.filters.append("renormalized reads to sum %d" % numreads)
    hs.addcommand(newexp, "normalizereads", params=params, replaceparams={'expdat': expdat})
    return newexp
def sortsamples(exp,field,numeric=False,logit=True):
    """
    reorder the samples according to their values in a mapping file field

    input:
    exp : Experiment
    field : string
        name of the field to sort by
    numeric : bool
        True to convert the field values with hs.tofloat before sorting, False to sort them as is
    logit : bool
        True to add to the command log and filters list, False to skip

    output:
    newexp : Experiment
        the sorted experiment
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    fieldvals = hs.getfieldvals(exp, field)
    if numeric:
        fieldvals = hs.tofloat(fieldvals)
    _, order = hs.isort(fieldvals)
    newexp = hs.reordersamples(exp, order)

    if not logit:
        return newexp
    hs.addcommand(newexp, "sortsamples", params=params, replaceparams={'exp': exp})
    newexp.filters.append('sorted samples by field %s' % field)
    return newexp
def filterbacteriafromfile(expdat,filename,exclude=False,subseq=False):
    """
    filter bacteria from an experiment based on a file with sequences (one per line)

    input:
    expdat : Experiment
    filename : str
        name of the sequence file (1 sequence per line, empty lines are ignored)
    exclude : bool
        True to remove the bacteria in the file instead of keeping them
    subseq : bool
        True if the sequences in the file can be subsequences of the experiment sequences
        (different lengths). but slower.

    output:
    newexp : Experiment
        the filtered experiment
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    # text mode already handles all newline conventions in python 3 (the 'U' mode flag
    # is rejected by python >= 3.11), and the context manager guarantees the file is closed
    with open(filename) as fl:
        stripped = (cline.strip() for cline in fl)
        # skip empty lines - an empty string is a subsequence of every sequence
        seqs = [cseq for cseq in stripped if cseq]

    # pass the subseq request through (it was previously ignored)
    newexp = hs.filterseqs(expdat, seqs, exclude=exclude, subseq=subseq)

    filt = 'Filter sequences from file '+filename
    if exclude:
        filt += ' (Exclude)'
    if subseq:
        filt += ' (subseq)'
    newexp.filters.append(filt)
    hs.addcommand(newexp, "filterbacteriafromfile", params=params, replaceparams={'expdat': expdat})
    return newexp
def filterannotations(expdat,annotation,cdb=None,exclude=False):
    """
    filter bacteria according to the presence of annotation in their cooldb description

    input:
    expdat : Experiment
    annotation : str
        substring of the annotation (case insensitive)
    cdb : cooldb
        the database of cool sequences (from cooldb.load()) or None (default) to use the heatsequer loaded cdb
    exclude : bool
        False to keep matching bacteria, True to remove matching bacteria

    output:
    newexp : Experiment
        the filtered experiment
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if cdb is None:
        cdb = hs.cdb

    keeplist = []
    for idx, cseq in enumerate(expdat.seqs):
        seqinfo = hs.cooldb.getseqinfo(cdb, cseq)
        matched = any(annotation.lower() in str(cinfo).lower() for cinfo in seqinfo)
        # with exclude the decision is reversed
        if matched != bool(exclude):
            keeplist.append(idx)

    newexp = hs.reorderbacteria(expdat, keeplist)
    newexp.filters.append('Filter annotations %s' % annotation)
    hs.addcommand(newexp, "filterannotations", params=params, replaceparams={'expdat': expdat})
    hs.Debug(6, '%d bacteria found' % len(keeplist))
    return newexp
def filterknownbact(expdat,cdb=None,exclude=False):
    """
    filter keeping only bacteria which we know about in cooldb (or only the unknown ones)

    input:
    expdat : Experiment
    cdb : cooldb
        the manual annotation database (from cooldb.loaddb) or None (default) to use the heatsequer loaded cdb
    exclude : bool
        True to throw away known bacteria, False to keep only them

    output:
    newexp : Experiment
        the filtered experiment (bacteria stay in their original relative order)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if cdb is None:
        cdb = hs.cdb

    known = [idx for idx, cseq in enumerate(expdat.seqs) if len(hs.cooldb.getseqinfo(cdb, cseq)) > 0]
    hs.Debug(2, 'Found %d sequences known in cooldb' % len(known))

    if exclude:
        # build an ordered list of the remaining positions (a set has no defined order
        # and cannot safely be used as the new bacteria order)
        knownset = set(known)
        keeppos = [idx for idx in range(len(expdat.seqs)) if idx not in knownset]
    else:
        keeppos = known
    newexp = hs.reorderbacteria(expdat, keeppos)

    if exclude:
        newexp.filters.append('filter exclude cooldb known bacteria')
    else:
        newexp.filters.append('filter cooldb known bacteria')
    hs.Debug(6, '%d bacteria left' % len(newexp.sids))
    # this entry contradicts the exclude mode, so it is only logged when keeping the known bacteria
    if not exclude:
        newexp.filters.append('keep only sequences from cooldb')
    hs.addcommand(newexp, "filterknownbact", params=params, replaceparams={'expdat': expdat})
    return newexp
def normalizebyseqs(expdat,seqs,exclude=False,fixorig=True):
    """
    normalize experiment by making the sum of frequencies in seqs constant in each sample

    input:
    expdat : Experiment
    seqs : list of str
        the sequences to use as the normalization factor (sum of the sequences)
        all of them must be present in expdat.seqdict
    exclude : bool
        True to use all sequences except the ones in seqs as the normalization factor, False to use seqs
    fixorig : bool
        True to also modify the origreads and scalingfactor of each sample, False to leave them as they were

    output:
    newexp : Experiment
        the normalized experiment
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.copyexp(expdat)
    spos = [expdat.seqdict[cseq] for cseq in seqs]
    if exclude:
        spos = np.setdiff1d(np.arange(len(expdat.seqs)), spos)

    # per sample sum of the normalizing sequences (as float)
    ssum = np.sum(expdat.data[spos, :], axis=0)+0.0
    # samples without any normalizing reads: avoid division by 0
    ssum[ssum == 0] = 1
    # per sample factor relative to the mean over all samples
    frat = ssum/np.mean(ssum)

    for idx in range(len(expdat.samples)):
        newexp.data[:, idx] = newexp.data[:, idx]/frat[idx]
        if fixorig:
            newexp.origreads[idx] = newexp.origreads[idx]/frat[idx]
            # use the factor of this sample (not the whole factor vector)
            newexp.scalingfactor[idx] = newexp.scalingfactor[idx]*frat[idx]

    filt = 'Normalize By Seqs '
    if len(spos) == 1:
        filt += newexp.tax[spos[0]]
    else:
        filt += str(len(spos))
    if exclude:
        filt += ' Exclude'
    newexp.filters.append(filt)
    hs.addcommand(newexp, "normalizebyseqs", params=params, replaceparams={'expdat': expdat})
    return newexp
def filterid(expdat,sids,exclude=False):
    """
    filter bacteria according to their (hashed) sequence ids

    input:
    expdat : Experiment
    sids : list of integers (or a single id)
        the (hashed) sequence ids to look for
    exclude : bool
        False to keep only these bacteria, True to filter them away

    output:
    newexp : Experiment
        the filtered experiment
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    # a single id is treated as a list of one id
    if type(sids) is not list:
        sids = [sids]
    hs.Debug(1, 'filter ids', sids)

    matched = [idx for cid in sids for idx, tid in enumerate(expdat.sids) if tid == cid]
    if exclude:
        matched = set(range(len(expdat.sids))).difference(matched)
    # remove duplicate positions
    keep = list(set(matched))
    hs.Debug(1, 'keep pos', keep)

    newexp = hs.reorderbacteria(expdat, keep)
    if exclude:
        filt = 'Filter %d ids (exclude)' % len(sids)
    else:
        filt = 'Filter %d ids' % len(sids)
    newexp.filters.append(filt)
    hs.addcommand(newexp, "filterid", params=params, replaceparams={'expdat': expdat})
    return newexp
def trimfieldnames(expdat,field,newfield,trimlen=6):
    """
    store a trimmed version of the per sample values of field in newfield
    note the experiment is modified in place

    input:
    expdat : Experiment
    field : str
        name of the field to trim the values in
    newfield : str
        name of the field where to keep the trimmed values
    trimlen : int
        >0 : trim keeping first trimlen chars
        otherwise : the value is sliced from position trimlen (for <0 keeps the last -trimlen chars)

    output:
    expdat : Experiment
        the same experiment object, with the trimmed field values added
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    for csamp in expdat.samples:
        fullval = expdat.smap[csamp][field]
        trimmed = fullval[:trimlen] if trimlen > 0 else fullval[trimlen:]
        expdat.smap[csamp][newfield] = trimmed

    expdat.fields.append(newfield)
    expdat.filters.append('Trim field names field %s trimlen %d' % (field, trimlen))
    hs.addcommand(expdat, "trimfieldnames", params=params, replaceparams={'expdat': expdat})
    return expdat
def sortbyvariance(expdat,field=False,value=False,exact=False,norm=False):
    """
    sort bacteria by the standard deviation of their reads
    the standard deviation is calculated on a subset of the samples (field/value/exact)
    and then the bacteria of the whole experiment are sorted according to it

    input:
    expdat : Experiment
    field : string
        name of the field to select the samples used for the calculation, or False to use all samples
    value : string
        value of field in the samples to use
    exact : bool
        is the value exact or a partial string
    norm : bool
        False to sort by the standard deviation, True to sort by standard deviation/mean

    output:
    newexp : Experiment
        the experiment (all samples) with bacteria sorted according to the subgroup values
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if field:
        subexp = hs.filtersamples(expdat, field, value, exact=exact)
    else:
        subexp = copy.deepcopy(expdat)

    spread = np.std(subexp.data, axis=1)
    if norm:
        spread = spread/np.mean(subexp.data, axis=1)
    _, order = hs.isort(spread)

    newexp = hs.reorderbacteria(expdat, order)
    newexp.filters.append("sort by variance field=%s value=%s normalize=%s" % (field, value, norm))
    hs.addcommand(newexp, "sortbyvariance", params=params, replaceparams={'expdat': expdat})
    return newexp
def toorigreads(expdat,inplace=False):
    """
    convert the reads back to absolute (integer) numbers using the scalingfactor field
    note an earlier definition with the same name exists in this file; being the later one,
    this definition is the one in effect

    input:
    expdat : Experiment
    inplace : bool
        True to modify expdat, False to create a new experiment

    output:
    newexp : Experiment
        data multiplied by scalingfactor (via hs.multvec), rounded and converted to int
        scalingfactor is replaced by the scalar 1
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = expdat if inplace else hs.copyexp(expdat)

    scaled = hs.multvec(newexp.data, newexp.scalingfactor)
    newexp.data = scaled
    newexp.data = np.round(newexp.data).astype(int)
    newexp.scalingfactor = 1

    newexp.filters.append("changed reads to origread value")
    hs.addcommand(newexp, "toorigreads", params=params, replaceparams={'expdat': expdat})
    return newexp
def sortbacteria(exp,inplace=False,logit=True):
    """
    reorder the bacteria according to their taxonomy string (order returned by hs.isort)

    input:
    exp : Experiment
        the experiment to sort
    inplace : bool
        passed to hs.reorderbacteria - True to sort in place (replace current experiment),
        False to create a new experiment
    logit : bool
        True to add to the filters list and command log, False to skip
        (if called from another heatsequer function)

    output:
    newexp : Experiment
        the experiment sorted by taxonomy name
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    _, order = hs.isort(exp.tax)
    newexp = hs.reorderbacteria(exp, order, inplace=inplace)
    if not logit:
        return newexp

    newexp.filters.append('sorted bacteria by taxonomy')
    hs.addcommand(newexp, "sortbacteria", params=params, replaceparams={'exp': exp})
    return newexp
def samplemeanpervalue(expdat,field):
    """
    collapse the experiment to 1 sample per unique value of field
    each new sample holds the per bacteria mean over all the samples sharing that value

    input:
    expdat : Experiment
    field : string
        the field to use (i.e. 'ENV_MATTER')

    output:
    newexp : Experiment
        the new experiment with 1 sample per unique value of field
        (the first sample with each value is kept as its representative)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    uniquevals = hs.getfieldvals(expdat, field, ounique=True)
    allvals = hs.getfieldvals(expdat, field, ounique=False)
    # value -> positions of the samples having it
    valpos = hs.listtodict(allvals)

    representatives = [valpos[cval][0] for cval in uniquevals]
    newexp = hs.reordersamples(expdat, representatives)
    for idx, cval in enumerate(uniquevals):
        newexp.data[:, idx] = np.mean(expdat.data[:, valpos[cval]], axis=1)

    newexp.filters.append('samplemeanpervalue for field %s' % field)
    hs.addcommand(newexp, "samplemeanpervalue", params=params, replaceparams={'expdat': expdat})
    return newexp
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
    """
    sort the bacteria according to their center of mass along the (optionally sorted) samples

    input:
    expdat : Experiment
    field : string
        the name of the field to sort the samples by before the calculation, or False to skip sorting
    numeric : bool
        True if the sort field is numeric (ignored if no sort field)
    uselog : bool
        True to floor the data at 1 and log2 transform it before the center of mass calculation

    output:
    newexp : Experiment
        the experiment with sorted bacteria
        (built from expdat, so the sample sorting is used only for the calculation)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if field:
        workexp = hs.sortsamples(expdat, field, numeric=numeric)
    else:
        workexp = hs.copyexp(expdat)

    dat = workexp.data
    if uselog:
        dat[dat < 1] = 1
        dat = np.log2(dat)

    # center of mass per bacteria = position weighted sum / total
    samplepos = np.arange(len(workexp.samples))
    centers = [
        np.dot(dat[cseqind, :], samplepos)/np.sum(dat[cseqind, :])
        for cseqind in range(len(workexp.seqs))
    ]
    _, order = hs.isort(centers)

    newexp = hs.reorderbacteria(expdat, order)
    newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field, uselog))
    hs.addcommand(newexp, "sortbycentermass", params=params, replaceparams={'expdat': expdat})
    return newexp
def fieldtobact(expdat,field,bactname='',meanreads=1000,cutoff=0):
    """
    add a new bacteria whose reads are the (rescaled) values of a mapping file field
    (to facilitate numeric analysis)

    input:
    expdat : Experiment
    field : string
        name of the field to convert
    bactname : string
        name of the new bacteria (empty to use the field name)
    meanreads : int
        the mean number of reads for the new bacteria (over the samples passing the cutoff)
    cutoff : int
        the minimal value of the field per sample (samples below it get meanreads instead)

    output:
    newexp : Experiment
        with an added bacteria holding the field values as reads
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if len(bactname) == 0:
        bactname = field

    vals = np.array(hs.tofloat(hs.getfieldvals(expdat, field)))
    # both masks are taken before any value is modified
    okmask = vals >= cutoff
    badmask = vals < cutoff
    scalefactor = np.mean(vals[okmask])
    vals[okmask] = (vals[okmask]/scalefactor)*meanreads
    vals[badmask] = meanreads

    newexp = hs.copyexp(expdat)
    # the return value is not used - relies on insertbacteria modifying newexp
    hs.insertbacteria(newexp, vals, bactname, bactname, logit=False)
    newexp.filters.append('add bacteria from map field %s' % field)
    hs.addcommand(newexp, "fieldtobact", params=params, replaceparams={'expdat': expdat})
    return newexp
def reloadmap(expdat,mapfilename):
    """
    replace the mapping file information of a loaded experiment with a newly loaded mapping file

    input:
    expdat : Experiment
    mapfilename : string
        name of the mapping file to reload

    output:
    newexp : Experiment
        like expdat but with smap, fields and mapmd5 taken from the new map file
        (samples missing from the new map are only reported, not removed)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.copyexp(expdat)
    mapsamples, newexp.smap, newexp.fields, newexp.mapmd5 = loadmap(mapfilename)

    missing = (csamp for csamp in newexp.samples if csamp not in mapsamples)
    for csamp in missing:
        hs.Debug(7, 'Sample %s not in new map!' % csamp)

    newexp.filters.append('reload map %s' % mapfilename)
    # NOTE(review): the command is logged as "reloadmapfile" although this function
    # is named reloadmap - confirm this is intended
    hs.addcommand(newexp, "reloadmapfile", params=params, replaceparams={'expdat': expdat})
    return newexp
def filtersamples(expdat,field,filtval,exact=True,exclude=False,numexpression=False,shownumoutput=True):
    """
    filter samples in experiment according to value in field

    input:
    expdat : Experiment
    field : string
        name of the field to filter by
    filtval : string or list of strings
        the string to filter (if a list of strings, a sample matches if any string in the list matches)
    exact : bool
        True for exact match, False for substring
    exclude : bool
        False to keep only matching samples, True to exclude matching samples
    numexpression : bool
        True if filtval is a python expression, False if just a value.
        For an expression, the field value is placed at the beginning of the expression (i.e. '<=5')
        samples with an empty field value never match an expression
    shownumoutput : bool
        True (default) to show number of samples remaining, False to not show

    output:
    newexp : Experiment
        the experiment with only the samples passing the filter
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    if not isinstance(filtval, list):
        filtval = [filtval]

    keep = []
    for cidx, csamp in enumerate(expdat.samples):
        keepit = False
        for cfilt in filtval:
            cval = expdat.smap[csamp][field]
            if numexpression:
                if len(cval) == 0:
                    continue
                # NOTE(security): eval executes arbitrary python built from the mapping file
                # value and filtval - only use numexpression with trusted map files/expressions
                if eval(cval+cfilt):
                    keepit = True
            elif exact:
                if cval == cfilt:
                    keepit = True
            else:
                if cfilt in cval:
                    keepit = True
        # if exclude reverse the decision
        if exclude:
            keepit = not keepit
        if keepit:
            keep.append(cidx)

    newexp = hs.reordersamples(expdat, keep)

    # describe all the values used for filtering (not only the last one tested)
    valstr = ','.join(str(cfilt) for cfilt in filtval)
    fstr = "filter data %s in %s " % (valstr, field)
    if exact:
        fstr = fstr+"(exact)"
    else:
        fstr = fstr+"(substr)"
    if exclude:
        fstr += " (exclude)"
    newexp.filters.append(fstr)
    hs.addcommand(newexp, "filtersamples", params=params, replaceparams={'expdat': expdat})

    if shownumoutput:
        hs.Debug(6, '%d Samples left' % len(newexp.samples))
    else:
        hs.Debug(1, '%d Samples left' % len(newexp.samples))
    return newexp
def updateorigreads(expdat,logit=True):
    """
    copy the per sample origreads values into the 'origReads' entry of the sample map
    note the experiment is modified in place

    input:
    expdat : Experiment
    logit : bool
        True to add to the filters list and command log, False to skip

    output:
    expdat : Experiment
        the same experiment object with the updated sample map
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    for idx, csamp in enumerate(expdat.samples):
        sampmap = expdat.smap[csamp]
        sampmap['origReads'] = expdat.origreads[idx]

    if not logit:
        return expdat
    expdat.filters.append("Update orig reads")
    hs.addcommand(expdat, "updateorigreads", params=params, replaceparams={})
    return expdat
def filterfieldwave(expdat,field,val1,val2=False,mineffect=1,method='mean',uselog=True):
    """
    find all sequences which show an effect size of at least mineffect between val1 and val2 samples in field
    no statistical significance testing is performed
    the data of expdat is not modified

    input:
    expdat : Experiment
    field : string
        name of field to use for group separation
    val1 : string
        value in field for group1
    val2 : string
        value in field for group2 or False for all the other samples except val1
    mineffect : float
        min absolute difference between the group means per OTU in order to keep it
    method : string
        'ranksum' to rank transform the reads of each OTU before comparing the means
        any other value (default 'mean') to compare the means of the reads
    uselog : bool
        True to floor the data at 1 and log2 transform it

    output:
    newexp : Experiment
        only with sequences showing at least mineffect difference, sorted by the difference
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    numseqs = len(expdat.seqs)
    numsamples = len(expdat.samples)

    # work on a private float copy: the clipping and rank transform below must not
    # overwrite the reads of the input experiment (and ranks can be fractional)
    dat = np.array(expdat.data, dtype=float)
    if uselog:
        dat[dat < 1] = 1
        dat = np.log2(dat)
    if method == 'ranksum':
        for idx in range(numseqs):
            dat[idx, :] = stats.rankdata(dat[idx, :])

    pos1 = hs.findsamples(expdat, field, val1)
    if val2:
        pos2 = hs.findsamples(expdat, field, val2)
    else:
        pos2 = np.setdiff1d(np.arange(numsamples), pos1, assume_unique=True)

    outpos = []
    odif = []
    for idx in range(numseqs):
        cdif = np.mean(dat[idx, pos1])-np.mean(dat[idx, pos2])
        if abs(cdif) >= mineffect:
            outpos.append(idx)
            odif.append(cdif)

    # order the kept sequences by their effect size
    si = np.argsort(odif)
    outpos = hs.reorder(outpos, si)
    newexp = hs.reorderbacteria(expdat, outpos)
    newexp.filters.append('filterfieldwave field %s val1 %s val2 %s' % (field, val1, val2))
    hs.addcommand(newexp, "filterfieldwave", params=params, replaceparams={'expdat': expdat})
    return newexp
def addsubtrees(expdat,tree,inplace=False):
    """
    add an otu for every subtree of the tree, with its frequency being the sum
    of all the experiment bacteria in that subtree

    input:
    expdat : Experiment
    tree :
        the tree for the experiment (its subsets() method supplies the subtrees)
    inplace : bool
        if True, start from expdat itself instead of a copy

    output:
    newexp : Experiment
        the experiment with the added subtree otus
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = expdat if inplace else hs.copyexp(expdat)

    for csubtree in tree.subsets():
        members = []
        subtax = ""
        subfreq = np.zeros([1, len(newexp.samples)])
        for cbact in csubtree:
            if cbact not in newexp.seqdict:
                hs.Debug(4, 'sequence not in seqdict', cbact)
                continue
            cpos = newexp.seqdict[cbact]
            members.append(cpos)
            subfreq += newexp.data[cpos, :]
            # taxonomy of the subtree is the common prefix of its members taxonomies
            if subtax == '':
                subtax = newexp.tax[cpos]
            else:
                subtax = hs.common_start(subtax, newexp.tax[cpos])
        # the new otu is named by the positions of its members
        subname = ''.join('%d,' % cpos for cpos in members)
        # add only if we have 2 bacteria or more (and the subtree was not added already)
        if len(members) > 1 and subname not in newexp.seqdict:
            newexp, newpos = insertbacteria(newexp, freqs=subfreq, seq=subname, tax=subtax, logit=False)

    newexp.filters.append("Add subtrees")
    hs.addcommand(newexp, "addsubtrees", params=params, replaceparams={'expdat': expdat})
    return newexp
def loadrdptax(expdat,rdpfilename,fastaname=False,threshold=60):
    """
    load rdp taxonomy (the output of download allrank in the rdp classifier website)
    and store it in the taxonomy of the matching experiment sequences
    note the experiment is modified in place

    input:
    expdat : Experiment
        the experiment for which the taxonomy was assigned
    rdpfilename : str
        name of the saved allrank rdp assignment file (';' separated)
    fastaname : str or False
        name of the fasta file used for the rdp assignment (to translate headers to sequences),
        or False if the first column of the rdp file is the sequence itself
    threshold : float
        the assignment probability (in percent) under which to not include the assignment
        (this level and all the levels after it are dropped)

    output:
    expdat : Experiment
        the same experiment object with the updated taxonomy
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    # fasta header -> sequence (empty if the rdp file holds the sequences)
    hdict = {}
    if fastaname:
        seqs, headers = hs.readfastaseqs(fastaname)
        hdict = dict(zip(headers, seqs))

    # the context manager closes the file also if parsing fails
    with open(rdpfilename, 'r') as fl:
        for cline in fl:
            cdat = cline.rstrip().split(';')
            # skip header lines
            if len(cdat) < 2:
                continue
            # check if sequence in experiment
            cseq = cdat[0]
            if fastaname:
                if cseq in hdict:
                    cseq = hdict[cseq]
                else:
                    hs.Debug(6, 'sequence %s not found in fasta file' % cseq)
            if cseq not in expdat.seqdict:
                hs.Debug(6, 'sequence %s not found in experiment' % cseq)
                continue
            cpos = expdat.seqdict[cseq]
            ctax = ''
            # from column 2 on the fields are (name, probability) pairs
            # stop before a trailing name without a probability
            for idx in range(2, len(cdat)-1, 2):
                cp = cdat[idx+1].rstrip('%')
                # honour the requested threshold (was hard coded to 60)
                if float(cp) < threshold:
                    break
                ctax += ';'+cdat[idx]
            expdat.tax[cpos] = ctax

    expdat.filters.append("loaded rdp taxonomy from file %s" % rdpfilename)
    hs.addcommand(expdat, "loadrdptax", params=params, replaceparams={'expdat': expdat})
    return expdat
def filtermapfields(expdat,fields=['#SampleID'],keep=True,inplace=False):
    """
    filter fields from the experiment mapping data
    the '#SampleID' field is always kept

    input:
    expdat : Experiment
    fields : list of str
        the list of the fields to keep/remove (the default list is never modified here)
    keep : bool (optional)
        True (default) to keep only the fields specified
        False to remove the fields specified
    inplace : bool (optional)
        False (default) to create new experiment
        True to replace in current experiment

    output:
    newexp : Experiment
        with only the fields requested (in their original order)
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    selected = set(expdat.fields)
    if keep:
        selected = selected.intersection(set(fields))
    else:
        selected = selected.difference(set(fields))
    selected.add('#SampleID')

    # keep the original field order (a set has no defined order), without duplicates
    newfields = []
    for cfield in list(expdat.fields)+['#SampleID']:
        if cfield in selected and cfield not in newfields:
            newfields.append(cfield)

    newsmap = {}
    for csamp in expdat.samples:
        newsmap[csamp] = {cfield: expdat.smap[csamp][cfield] for cfield in newfields}

    newexp = expdat if inplace else hs.copyexp(expdat)
    newexp.fields = newfields
    newexp.smap = newsmap
    # log on the returned experiment (not on the input one, which is unchanged when inplace=False)
    newexp.filters.append('filter map fields %s (keep=%s)' % (fields, keep))
    hs.addcommand(newexp, "filtermapfields", params=params, replaceparams={'expdat': expdat})
    return newexp
def clearexp(expdat):
    """
    clear experiment from missing samples and bacteria
    remove samples with <1 reads and bacteria with total <1 reads

    input:
    expdat : Experiment

    output:
    newexp : Experiment
        the new experiment without <1 reads samples or bacteria
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = filterorigreads(expdat, 1)
    # filter the bacteria of the already sample-filtered experiment
    # (filtering expdat again would discard the sample filtering)
    newexp = filterminreads(newexp, 1)

    newexp.filters.append('clear nonpresent bacteria and samples')
    hs.Debug(6, '%d bacteria left' % len(newexp.sids))
    hs.addcommand(newexp, "clearexp", params=params, replaceparams={'expdat': expdat})
    return newexp
def joinfields(expdat,field1,field2,newfield):
    """
    create a new per sample field holding the values of 2 fields joined with ';'
    note the experiment is modified in place

    input:
    expdat : Experiment
    field1,field2 : string
        name of the 2 fields to join
    newfield : string
        name of new field to add

    output:
    expdat : Experiment
        the same experiment object with the added field
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    for csamp in expdat.samples:
        sampmap = expdat.smap[csamp]
        joined = sampmap[field1]+';'+sampmap[field2]
        sampmap[newfield] = joined

    expdat.fields.append(newfield)
    expdat.filters.append("join fields %s, %s to new field %s" % (field1, field2, newfield))
    hs.addcommand(expdat, "joinfields", params=params, replaceparams={'expdat': expdat})
    return expdat