Example #1
    def mergeFiles(self):
        """Merge the column chunks written by split() back into one matrix."""
        files = [
            sh.loadDf(self.splitOutDir + '/' + str(i) + quantile.myPostfix).T
            for i in range(1, self.nFiles + 1)
        ]
        concatedFile = pd.concat(files)
        return concatedFile.T
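
The transpose-concat-transpose trick works because pd.concat stacks frames row-wise by default, so transposing each chunk turns a column-wise split into a row stack. A minimal round-trip sketch with plain pandas (toy data; the sh and quantile helpers are replaced by in-memory frames):

import pandas as pd

# toy gene-by-sample matrix
df = pd.DataFrame({'s1': [1, 2], 's2': [3, 4], 's3': [5, 6]}, index=['g1', 'g2'])

# split column-wise into chunks, as split() would write them to disk
chunks = [df.iloc[:, 0:2], df.iloc[:, 2:3]]

# merge: transpose each chunk so samples become rows, concat, transpose back
merged = pd.concat([c.T for c in chunks]).T
assert merged.equals(df)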
Example #2
    def upgradeToSrrDF(self, BioByCellSampleDF, cache=True):
        """
        implicit export: export file to bioSampleAnnotation as SRR
        implicit input: self.bioSampleAnnotation
        note: [IMPORTANT] don't cache for the first run
        :return: SRR by UMLS DF
        """

        srrConvertedFname = self.bioSampleAnnotationSrrDir

        if not cache or not os.path.isfile(srrConvertedFname + '.npy'):
            # drop any stale cached copy before regenerating
            if os.path.isfile(shv.cacheName(srrConvertedFname + '.npy')):
                os.remove(shv.cacheName(srrConvertedFname + '.npy'))
            # SRR -> BioSample mapping, pooled over genome builds and data types
            AllBioSampleIds = {
                genome + '_' + dtype:
                pd.DataFrame.from_csv(Dir + genome + '/metaData.txt').BioSample
                for genome in shv.genomeBuild
                for dtype, Dir in shv.baseDirDict.iteritems()
            }
            srrToBioSampleS = pd.concat(AllBioSampleIds.values())
            inputS = srrToBioSampleS
            # empty SRR-by-term frame covering only SRRs whose BioSample is annotated
            returnDF = shv.createEmptyDf(
                srrToBioSampleS[srrToBioSampleS.isin(
                    BioByCellSampleDF.index)].index.unique(),
                BioByCellSampleDF.columns)
            # copy each BioSample's annotation row to every SRR derived from it
            for srr, biosample in inputS.iteritems():
                if biosample in BioByCellSampleDF.index:
                    if srr in returnDF.index:
                        returnDF.loc[srr] = BioByCellSampleDF.loc[biosample]
            shv.exportDf(srrConvertedFname, returnDF)
        else:
            returnDF = shv.loadDf(srrConvertedFname)
        return returnDF
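
The inner loop is a row-by-row broadcast of BioSample-level annotations up to SRR resolution. Assuming each SRR maps to exactly one BioSample, the same lookup can be sketched with a vectorized reindex (toy data; bioByTermDF and srrToBioS are illustrative stand-ins, not the shv helpers):

import pandas as pd

# BioSample-level annotations (BioSample x term)
bioByTermDF = pd.DataFrame({'CUI1': [1, 0], 'CUI2': [0, 1]},
                           index=['SAMN01', 'SAMN02'])
# SRR -> BioSample mapping (one BioSample can cover several runs)
srrToBioS = pd.Series({'SRR1': 'SAMN01', 'SRR2': 'SAMN01', 'SRR3': 'SAMN02'})

# look up each run's BioSample row; the result is indexed by SRR
srrByTermDF = bioByTermDF.reindex(srrToBioS.values)
srrByTermDF.index = srrToBioS.index
print(srrByTermDF)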
Example #3

    def split(self, fname, nfiles, splitOutDir, clean=True):
        """Split the transcript-by-SRR matrix column-wise into nfiles chunks."""
        if not os.path.isdir(splitOutDir):
            os.mkdir(splitOutDir)
        if clean:
            self.cleanDir(splitOutDir)
        trscrptBySrr = sh.loadDf(fname)
        # integer chunk width; the last chunk absorbs any remainder
        chunkSize = trscrptBySrr.shape[1] // nfiles
        print 'chunkSize: ', chunkSize
        for i in range(nfiles):
            DFindex = i * chunkSize
            lastIndex = nfiles - 1
            ub = trscrptBySrr.shape[1] if i == lastIndex else DFindex + chunkSize
            subDF = trscrptBySrr.iloc[:, DFindex:ub]
            sh.exportDf(splitOutDir + str(i + 1), subDF)
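
The boundary arithmetic: with integer division the first nfiles - 1 chunks each get chunkSize columns and the final chunk takes whatever remains, so no column is dropped when nfiles does not divide the column count. A quick check of the boundaries on a hypothetical 7-column frame:

import pandas as pd

df = pd.DataFrame([list(range(7))], columns=list('abcdefg'))
nfiles = 3
chunkSize = df.shape[1] // nfiles  # 7 // 3 == 2

bounds = []
for i in range(nfiles):
    start = i * chunkSize
    stop = df.shape[1] if i == nfiles - 1 else start + chunkSize
    bounds.append((start, stop))

print(bounds)  # [(0, 2), (2, 4), (4, 7)] -- the last chunk absorbs the remainder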
Example #4
    def split(self,
              fname,
              nfiles,
              splitOutDir,
              srrToCUIDir,
              clean=True,
              minN=3):

        if not os.path.isdir(splitOutDir):
            os.makedirs(splitOutDir)
        if clean:
            self.cleanDir(splitOutDir)
        # load transcript DF
        trscrptBySrr = sh.loadDf(fname)
        tmpSrrToTerms = pd.read_pickle(srrToCUIDir)
        srrToTerms = tmpSrrToTerms[tmpSrrToTerms.index.isin(
            trscrptBySrr.columns)]
        # invert the SRR -> term mapping into term -> set of SRRs
        termToSrrDict = defaultdict(set)
        for srr, term in srrToTerms.iteritems():
            termToSrrDict[term].add(srr)
        # partition the terms and samples into files, keeping only terms
        # with enough samples
        filteredDict = {
            term: SRRset
            for term, SRRset in termToSrrDict.iteritems()
            if len(SRRset) >= sh.maxNClusters
        }
        # keep a table of term to ID
        termToId = pd.Series(range(1,
                                   len(filteredDict.keys()) + 1),
                             index=filteredDict.keys())
        termToId.to_pickle(self.termToIdSDir)

        print trscrptBySrr.shape
        # write each retained term's sub-matrix to its own numbered file
        for term, i in termToId.iteritems():
            srrSet = filteredDict[term]
            subDF = trscrptBySrr.loc[:, trscrptBySrr.columns.isin(srrSet)]
            sh.exportDf(splitOutDir + str(i), subDF)
        return termToId
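
The inversion-plus-filter idiom above, in isolation: build term -> set-of-SRRs from the SRR -> term Series, then keep only terms with enough samples. Toy data; minN here stands in for the sh.maxNClusters threshold used above:

import pandas as pd
from collections import defaultdict

srrToTerm = pd.Series({'SRR1': 'liver', 'SRR2': 'liver',
                       'SRR3': 'liver', 'SRR4': 'kidney'})

# invert the mapping: one term can own many runs
termToSrrDict = defaultdict(set)
for srr, term in srrToTerm.items():
    termToSrrDict[term].add(srr)

minN = 3
filtered = {t: s for t, s in termToSrrDict.items() if len(s) >= minN}
print(filtered)  # {'liver': {'SRR1', 'SRR2', 'SRR3'}} -- set order may vary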
Example #5
    def getSubUMLSDF(self, terms, threshold=500, filterOverlyPrevalent=True):
        """
        input : terms: list of terms whose children should be incorporated
        output: sub UMLS matrix

        param: filterOverlyPrevalent: filter out the most prevalent terms
        NOTE: this one has no synonym optimization
        """
        dfFname = param.CellOntologyToUMLSDFDir
        if not os.path.isfile(dfFname + '.npy'):
            self._init_UMLSDF(dfFname)
        # pool the input terms' children across the ontology
        nodes = reduce(lambda a, term: a.union(set(self.childrenTermsOf(term))),
                       terms, set())
        cellOntoUMLSDF = shv.loadDf(dfFname).abs()
        myUMLSSubDF = cellOntoUMLSDF.loc[list(nodes)]
        myUMLSSubDF = myUMLSSubDF.fillna(0)
        # keep only columns with any signal, then binarize against the threshold
        denseSubDF = myUMLSSubDF.loc[:, myUMLSSubDF.any(axis=0)]
        binarizedDense = (denseSubDF > threshold).astype(np.float64)

        if filterOverlyPrevalent:
            s = binarizedDense.sum(axis=0)
            binarizedDense = binarizedDense[s[~(s == s.max())].index]

        return binarizedDense
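
The binarize-and-filter steps at the end, in isolation: drop all-zero columns, threshold, then remove the most prevalent term(s), since a term that hits every row carries no contrast between cell types. Toy counts; the 500 threshold mirrors the default above:

import pandas as pd
import numpy as np

counts = pd.DataFrame({'CUI1': [600, 0], 'CUI2': [501, 502], 'CUI3': [0, 0]},
                      index=['cell type A', 'cell type B'])

# keep columns with any signal, then binarize against the threshold
dense = counts.loc[:, counts.any(axis=0)]
binarized = (dense > 500).astype(np.float64)

# drop the most prevalent column(s)
colSums = binarized.sum(axis=0)
filtered = binarized[colSums[colSums != colSums.max()].index]
print(filtered)  # only CUI1 survives; CUI2 hit every row, CUI3 had no signal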
Example #6
def func(Dir):
    """Rank-transform each sample's transcripts and export the result."""
    # [IMPORTANT] TAKE THIS LINE OUT WHEN DONE
    tissueSubDF = sh.loadDf(Dir)
    outDF = tissueSubDF.rank(axis=0)
    sh.exportDf(Dir + myPostfix, outDF)
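
What the rank transform does: rank(axis=0) replaces each sample's expression values with their within-sample ranks, ties averaged, which makes samples comparable regardless of scale. A small sketch with plain pandas:

import pandas as pd

df = pd.DataFrame({'SRR1': [5.0, 1.0, 3.0], 'SRR2': [2.0, 2.0, 9.0]},
                  index=['g1', 'g2', 'g3'])

# rank each sample's transcripts independently (axis=0 ranks down each column)
ranked = df.rank(axis=0)
print(ranked)
#      SRR1  SRR2
# g1    3.0   1.5
# g2    1.0   1.5
# g3    2.0   3.0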
Example #7
import sys
import os
from collections import defaultdict
import numpy as np
import pandas as pd
sys.path.append('/cellar/users/btsui/Project/METAMAP/code/metamap')
import sharedVariable as shv

inDir = '/oasis/btsui/Data/SRA/MATRIX/DATA/hgGRC38/'
inDfName = 'allSRAspliceVariantMaxMatrix'

inDF = inDir + inDfName
sraByTrscrptDF = shv.loadDf(inDF).astype(np.float64)
profileLocation = '/cellar/users/btsui/Data/nrnb01_nobackup/METAMAP/'
profileMembershipExtd = '.kmeanMembership.merged.pyc'
profileMembershipSDir = profileLocation + inDfName + profileMembershipExtd

outDfName = profileLocation + inDfName + '.var.merged.pyc'

# SRR -> k-means profile membership
profileMembershipS = pd.read_pickle(profileMembershipSDir)
# invert the membership Series into profile -> set of SRRs
profileToSrrDict = defaultdict(set)
for srr, profile in profileMembershipS.iteritems():
    profileToSrrDict[profile].add(srr)

# per-profile variance of each transcript across that profile's samples
myDict = {}
for profile, Srrs in profileToSrrDict.iteritems():
    subDf = sraByTrscrptDF.loc[:, list(Srrs)]
    myDict[profile] = pd.Series(subDf.values.var(axis=1),
                                index=subDf.index)
trscrptByProfileVarDf = pd.DataFrame(myDict)
trscrptByProfileVarDf.to_pickle(outDfName)
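
The loop computes, for each profile, the variance of every transcript across that profile's samples. A compact groupby-based equivalent on toy data; note that numpy's var here is the population variance (ddof=0), matching .values.var(axis=1) above:

import pandas as pd

df = pd.DataFrame({'SRR1': [1.0, 4.0], 'SRR2': [3.0, 4.0], 'SRR3': [10.0, 0.0]},
                  index=['t1', 't2'])
membership = pd.Series({'SRR1': 0, 'SRR2': 0, 'SRR3': 1})

# group sample IDs by their profile label, then take row-wise variance per group
varDict = {}
for profile, srrs in membership.groupby(membership).groups.items():
    sub = df.loc[:, srrs]
    varDict[profile] = pd.Series(sub.values.var(axis=1), index=sub.index)
print(pd.DataFrame(varDict))
#       0    1
# t1  1.0  0.0
# t2  0.0  0.0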