def mergeFiles(self):
    """Re-assemble the split per-chunk files back into one DataFrame.

    Loads files 1..self.nFiles from self.splitOutDir (each written with
    quantile.myPostfix appended), concatenates them along the split axis,
    and returns the merged samples-by-features frame.
    """
    parts = []
    for idx in range(1, self.nFiles + 1):
        chunkPath = self.splitOutDir + '/' + str(idx) + quantile.myPostfix
        # transpose so the split dimension becomes the row axis for concat
        parts.append(sh.loadDf(chunkPath).T)
    merged = pd.concat(parts)
    # transpose back to the original orientation
    return merged.T
def upgradeToSrrDF(self,BioByCellSampleDF,cache=True):
    """Expand a BioSample-level annotation matrix to SRR (run) level.

    Implicit export: writes the SRR-level matrix to self.bioSampleAnnotationSrrDir
    via shv.exportDf.
    Implicit input: metadata tables under shv.baseDirDict / shv.genomeBuild.
    [IMPORTANT] do not pass cache=True on the first run (no cached file exists yet).

    :param BioByCellSampleDF: BioSample-by-term DataFrame to be replicated per SRR
    :param cache: when True and a cached .npy export exists, load and return it
        instead of rebuilding
    :return: SRR-by-term DataFrame (rows copied from the owning BioSample)
    """
    srrConvertedFname=self.bioSampleAnnotationSrrDir
    if not cache or not os.path.isfile(srrConvertedFname+'.npy'):
        # drop any stale secondary cache before rebuilding
        if os.path.isfile(shv.cacheName(srrConvertedFname+'.npy')):
            os.remove(shv.cacheName(srrConvertedFname+'.npy'))
        # one Series per (genome build, data type): index = SRR, value = BioSample id
        # NOTE(review): pd.DataFrame.from_csv is long-deprecated pandas API — confirm runtime pandas version
        AllBioSampleIds={genome+'_'+dtype: pd.DataFrame.from_csv(Dir+genome+'/metaData.txt').BioSample for genome in shv.genomeBuild for dtype, Dir in shv.baseDirDict.iteritems()}
        srrToBioSampleS=pd.concat(AllBioSampleIds.values())
        inputS=srrToBioSampleS
        # pre-allocate the output: one row per SRR whose BioSample appears in the input
        returnDF=shv.createEmptyDf( srrToBioSampleS[srrToBioSampleS.isin(BioByCellSampleDF.index)].index.unique(),BioByCellSampleDF.columns )
        # copy each BioSample's annotation row onto every SRR that belongs to it
        for srr,biosample in inputS.iteritems():
            if biosample in BioByCellSampleDF.index:
                if srr in returnDF.index:
                    returnDF.loc[srr]=BioByCellSampleDF.loc[biosample]
        shv.exportDf(srrConvertedFname,returnDF)
    else:
        # cache hit: reuse the previously exported matrix
        returnDF=shv.loadDf(srrConvertedFname)
    return returnDF
def split(self, fname, nfiles, splitOutDir, clean=True): if not os.path.isdir(splitOutDir): os.mkdir(splitOutDir) if clean: self.cleanDir(splitOutDir) trscrptBySrr = sh.loadDf(fname) chunkSize = trscrptBySrr.shape[1] / nfiles print 'chunkSize: ', chunkSize for i in range(nfiles): DFindex = i * chunkSize lastIndex = (nfiles - 1) ub = trscrptBySrr.shape[ 1] if i == lastIndex else DFindex + chunkSize subDF = trscrptBySrr.iloc[:, DFindex:ub] sh.exportDf(splitOutDir + str(i + 1), subDF)
def split(self, fname, nfiles, splitOutDir, srrToCUIDir, clean=True, minN=3):
    """Partition a transcript-by-SRR matrix into per-term sub-matrices on disk.

    Writes one DataFrame per sufficiently large term to splitOutDir, plus a
    term-to-file-id Series pickled at self.termToIdSDir.

    :param fname: path of the transcript-by-SRR DataFrame (loaded via sh.loadDf)
    :param nfiles: NOTE(review): accepted but never used in this body — confirm intent
    :param splitOutDir: output directory (created if missing)
    :param srrToCUIDir: pickle path of a Series mapping SRR -> CUI term
    :param clean: when True, empty the output directory first
    :param minN: NOTE(review): accepted but never used — the size filter below
        uses sh.maxNClusters instead; confirm which threshold was intended
    :return: Series mapping term -> numeric file id (1-based)
    """
    if not os.path.isdir(splitOutDir):
        os.makedirs(splitOutDir)
    if clean:
        self.cleanDir(splitOutDir)
    #load transcript DF
    trscrptBySrr = sh.loadDf(fname)
    # keep only SRRs that actually appear as columns of the transcript matrix
    tmpSrrToTerms = pd.read_pickle(srrToCUIDir)
    srrToTerms = tmpSrrToTerms[tmpSrrToTerms.index.isin( trscrptBySrr.columns)]
    #build the term -> set-of-SRRs mapping
    termToSrrDict = defaultdict(set)
    for srr, term in srrToTerms.iteritems():
        termToSrrDict[term].add(srr)
    #keep only terms with enough samples (threshold: sh.maxNClusters)
    filteredDict = { term: SRRset for term, SRRset in termToSrrDict.iteritems() if len(SRRset) >= sh.maxNClusters }
    #keep a table of term to ID (file number)
    termToId = pd.Series(range(1, len(filteredDict.keys()) + 1), index=filteredDict.keys())
    termToId.to_pickle(self.termToIdSDir)
    #write one column-subset DataFrame per retained term
    print trscrptBySrr.shape
    for term, i in termToId.iteritems():
        srrSet = filteredDict[term]
        subDF = trscrptBySrr.loc[:, trscrptBySrr.columns.isin(srrSet)]
        sh.exportDf(splitOutDir + str(i), subDF)
    #return the key mapping term -> file id
    return termToId
def getSubUMLSDF(self, terms, threshold=500, filterOverlyPrevalent=True):
    """Return a binarized sub-matrix of the cell-ontology-to-UMLS table.

    :param terms: list of ontology terms; each term's children (via
        self.childrenTermsOf) are included in the row selection
    :param threshold: counts strictly above this become 1.0, else 0.0
    :param filterOverlyPrevalent: when True, drop the column(s) whose
        binarized column sum is the maximum (the most prevalent terms)
    :return: float64 0/1 DataFrame (rows: selected ontology terms,
        columns: UMLS terms with at least one non-zero entry)

    NOTE: no synonym optimization here.
    """
    dfFname = param.CellOntologyToUMLSDFDir
    # lazily build the cached ontology-to-UMLS matrix on first use
    if not os.path.isfile(dfFname + '.npy'):
        self._init_UMLSDF(dfFname)
    # union of all children of every requested term
    nodes = set()
    for term in terms:
        nodes = nodes.union(set(self.childrenTermsOf(term)))
    cellOntoUMLSDF = shv.loadDf(dfFname).abs()
    subDF = cellOntoUMLSDF.loc[nodes].fillna(0)
    # drop all-zero columns, then binarize against the threshold
    denseSubDF = subDF.loc[:, subDF.any(axis=0)]
    binarizedDense = (denseSubDF > threshold).astype(np.float64)
    if filterOverlyPrevalent:
        colSums = binarizedDense.sum(axis=0)
        keptCols = colSums[~(colSums == colSums.max())].index
        binarizedDense = binarizedDense[keptCols]
    return binarizedDense
def func(Dir):
    """Rank-transform (per column) the DataFrame at Dir and export it with myPostfix appended."""
    # [IMPORTANT] TAKE THIS LINE OUT WHEN DONE
    inputDF = sh.loadDf(Dir)
    rankedDF = inputDF.rank(axis=0)
    sh.exportDf(Dir + myPostfix, rankedDF)
# Script: compute per-transcript expression variance within each k-means
# profile cluster and pickle the resulting transcript-by-profile matrix.
import sys
import numpy as np
sys.path.append('/cellar/users/btsui/Project/METAMAP/code/metamap')
import sharedVariable as shv
import pandas as pd
import os
# input: transcript-by-SRR splice-variant matrix
inDir = '/oasis/btsui/Data/SRA/MATRIX/DATA/hgGRC38/'
inDfName = 'allSRAspliceVariantMaxMatrix'
inDF = inDir + inDfName
sraByTrscrptDF = shv.loadDf(inDF).astype(np.float64)
# k-means cluster membership: Series mapping SRR -> profile id
profileLocation = '/cellar/users/btsui/Data/nrnb01_nobackup/METAMAP/'
profileMembershipExtd = '.kmeanMembership.merged.pyc'
profileMembershipSDir = profileLocation + inDfName + profileMembershipExtd
outDfName = profileLocation + inDfName + '.var.merged.pyc'
profileMembershipS = pd.read_pickle(profileMembershipSDir)
from collections import defaultdict
# invert the membership Series: profile id -> set of SRR columns
profileToSrrDict = defaultdict(set)
for srr, profile in profileMembershipS.iteritems():
    profileToSrrDict[profile].add(srr)
# per-profile variance of each transcript across that profile's SRRs
myDict = {}
for profile, Srrs in profileToSrrDict.iteritems():
    # NOTE(review): indexing .loc with a set relies on older pandas behavior — confirm version
    subDf = sraByTrscrptDF.loc[:, Srrs]
    # NOTE(review): .as_matrix() is removed in modern pandas (use .values/.to_numpy())
    myDict[profile] = pd.Series(subDf.as_matrix().var(axis=1),
                                index=subDf.index)
# assemble transcript-by-profile variance matrix and persist it
trscrptByProfileVarDf = pd.DataFrame(myDict)
trscrptByProfileVarDf.to_pickle(outDfName)