def load(tablename, mapname='map.txt', taxfile='', nameisseq=True, studyname=False,
         tabletype='biom', normalize=True, addsname='', keepzero=False,
         removefrom=False, removenum=1, mapsampletolowercase=False, sortit=True,
         useseqnamefortax=True, rawreads=False, usesparse=False):
    """
    Load an experiment - a biom table and a mapping file.

    input:
    tablename : str
        the name of the biom table file
    mapname : str
        name of the mapping file
    taxfile : str
        empty ('') to load taxonomy from biom table, non-empty to load from
        rdp output file (web)
    nameisseq : bool
        False to keep otu name as sid without hashing it,
        True to treat otuid as sequence
    studyname : False or str
        False to assign from table file name, otherwise string to store as
        study name
    tabletype : str
        'biom' - a biom table
        'meta' - a metabolomics table (row per sample, col per metabolite,
        can contain duplicate metaboliteids)
    normalize : bool
        True to normalize to 10k reads per sample, False to not normalize
        (change to mean 10k reads/sample)
    addsname : str
        a string to add to each table sample name (or empty to not add)
    keepzero : bool
        False (default) to throw away samples with 0 reads, True to keep them
    removefrom : str or False
        if non empty - cut table sample name after (and including) the
        removenum'th occurrence of removefrom
    removenum : int
        number of occurrences of removefrom after which to cut (see above)
    mapsampletolowercase : bool
        True to convert the mapping file sample id to lower case
        (for EMP data). default=False
    sortit : bool
        True (default) to sort sequences by taxonomy, False to not sort
    useseqnamefortax : bool
        True (default) to use the sequence as taxonomy if no taxonomy
        supplied, False to use 'unknown'
    rawreads : bool
        True in combination with normalize=False - do not modify read count
        to mean 10k
    usesparse : bool
        True to use sparse matrix representation, False to use non-sparse
        (default)

    output:
    exp : Experiment
        an experiment class for the current experiment
    """
    # capture the call parameters before any locals are created,
    # for recording in the experiment command history (hs.addcommand below)
    params = locals()

    # load the table
    if tabletype == 'biom':
        hs.Debug(6, 'Loading biom table')
        table = biom.load_table(tablename)
    elif tabletype == 'meta':
        hs.Debug(6, 'Loading metabolite table')
        table = loadmetabuckettable(tablename)
    else:
        hs.Debug(9, 'Table type %s not supported' % tabletype)
        return False

    # md5 of the raw data matrix - stored on the experiment for provenance
    datamd5g = hashlib.md5()
    datamd5g.update(table.matrix_data.todense().A.view(np.uint8))
    datamd5 = datamd5g.hexdigest()
    hs.Debug(6, 'data md5 is %s' % datamd5)

    # if need to cut table sample names
    if removefrom:
        idtable = {}
        foundids = {}
        ids = table.ids(axis='sample')
        if len(set(ids)) != len(ids):
            hs.Debug(8, 'non unique ids identified')
        for cid in ids:
            if removefrom in cid:
                fpos = hs.findn(cid, removefrom, removenum)
                if fpos == -1:
                    hs.Debug(6, 'Did not find enough %s in %s' % (removefrom, cid))
                    tid = cid
                else:
                    tid = cid[:fpos]
            else:
                hs.Debug(6, '%s not found in sample name %s (removefrom)' % (removefrom, cid))
                tid = cid
            if tid in foundids:
                # truncated name collides with a previous one - disambiguate
                hs.Debug(6, 'already have id %s' % cid)
                foundids[tid] += 1
                idtable[cid] = tid + '-rep-' + str(foundids[tid])
                hs.Debug(6, 'new sample name is %s' % idtable[cid])
            else:
                foundids[tid] = 1
                idtable[cid] = tid
        hs.Debug(6, 'found %d keys %d values' % (len(set(idtable.keys())), len(set(idtable.values()))))
        table = table.update_ids(idtable, axis='sample')

    # if need to add constant string to sample names in table
    if addsname != '':
        idtable = {}
        ids = table.ids(axis='sample')
        for cid in ids:
            idtable[cid] = addsname + cid
        table = table.update_ids(idtable, axis='sample')

    smap = {}
    mapsamples = []
    mapmd5 = ''
    if mapname:
        # if mapping file supplied, load it
        mapsamples, smap, fields, mapmd5 = loadmap(mapname, mapsampletolowercase=mapsampletolowercase)
    else:
        # no mapping file, so just create the #SampleID field
        hs.Debug(6, 'No mapping file supplied - using just sample names')
        tablesamples = table.ids(axis='sample')
        for cid in tablesamples:
            smap[cid] = {'#SampleID': cid}
            mapsamples.append(cid)
        fields = ['#SampleID']
        mapmd5 = ''

    # remove table samples not in mapping file
    tablesamples = table.ids(axis='sample')
    hs.Debug(6, 'number of samples in table is %d' % len(tablesamples))
    removelist = []
    for cid in tablesamples:
        if cid not in mapsamples:
            removelist.append(cid)
            hs.Debug(6, 'Table sample %s not found in mapping file' % cid)
    hs.Debug(6, 'removing %s samples' % len(removelist))
    if len(removelist) > 0:
        table = table.filter(removelist, axis='sample', invert=True)
        tablesamples = table.ids(axis='sample')
        hs.Debug(6, 'deleted. number of samples in table is now %d' % len(tablesamples))

    # remove samples not in table from mapping file (or, with keepzero,
    # remember them in addlist so they can be added back as all-zero columns)
    removemap = []
    addlist = []
    for idx, cmap in enumerate(mapsamples):
        if cmap not in tablesamples:
            hs.Debug(2, 'Map sample %s not in table' % cmap)
            if not keepzero:
                removemap.append(idx)
                try:
                    del smap[cmap]
                except KeyError:
                    # already deleted for an earlier occurrence of this id
                    hs.Debug(8, 'Duplicate SampleID %s in mapping file' % cmap)
            else:
                addlist.append(cmap)
    if len(removemap) > 0:
        hs.Debug(7, 'removing %d samples from mapping file' % len(removemap))
        mapsamples = hs.delete(mapsamples, removemap)
    hs.Debug(7, 'number of samples in mapping file is now %d' % len(mapsamples))

    # get info about the sequences
    tableseqs = table.ids(axis='observation')
    sids = []
    tax = []
    osnames = []
    for cid in tableseqs:
        # get the original observation name
        osnames.append(cid)
        # get the sid (hash)
        if nameisseq:
            sids.append(hs.hashseq(cid))
        else:
            sids.append(cid)
        # get the taxonomy string
        ctax = gettaxfromtable(table, cid, useseqname=useseqnamefortax)
        tax.append(ctax)

    if not studyname:
        studyname = os.path.basename(tablename)

    exp = hs.Experiment()
    exp.datatype = tabletype
    if usesparse:
        exp.data = scipy.sparse.dok_matrix(table.matrix_data)
    else:
        exp.data = table.matrix_data.todense().A
    # check if need to add the 0 read samples to the data
    if len(addlist) > 0:
        tablesamples = list(tablesamples)
        tablesamples = tablesamples + addlist
        exp.data = np.hstack([exp.data, np.zeros([np.shape(exp.data)[0], len(addlist)])])
    exp.smap = smap
    exp.samples = tablesamples
    exp.seqs = tableseqs
    for idx, cseq in enumerate(exp.seqs):
        exp.seqdict[cseq] = idx
    exp.sids = sids
    exp.origotunames = osnames
    exp.tax = tax
    exp.tablefilename = tablename
    exp.studyname = studyname
    # BUGFIX: was assigned tablename, clobbering the mapping-file name
    exp.mapfilename = mapname
    exp.filters = [tablename]
    exp.fields = fields
    exp.datamd5 = datamd5
    exp.mapmd5 = mapmd5
    colsum = np.sum(exp.data, axis=0, keepdims=False)
    exp.origreads = list(colsum)
    # add the original number of reads as a field to the experiment
    exp.fields.append('origReads')
    for idx, csamp in enumerate(exp.samples):
        exp.smap[csamp]['origReads'] = str(exp.origreads[idx])

    # normalize samples to 10k reads per sample
    colsum = np.sum(exp.data, axis=0, keepdims=True)
    okreads = np.where(colsum > 0)
    if np.size(colsum) - np.size(okreads[1]) > 0:
        # user-facing warning about empty samples
        print("Samples with 0 reads: %d" % (np.size(colsum) - np.size(okreads[1])))
        if not keepzero:
            exp = hs.reordersamples(exp, okreads[1])
        colsum = np.sum(exp.data, axis=0, keepdims=True)
    if tabletype == 'meta':
        # metabolite tables are never read-count normalized
        normalize = False
    if normalize:
        exp.data = 10000 * exp.data / colsum
    else:
        if not rawreads:
            exp.data = 10000 * exp.data / np.mean(colsum)

    exp.uniqueid = exp.getexperimentid()
    if sortit:
        exp = hs.sortbacteria(exp, logit=False)
    hs.addcommand(exp, "load", params=params)
    exp.filters.append('loaded table=%s, map=%s' % (tablename, mapname))
    return exp