Exemplo n.º 1
0
    def run(self):
        """Summarise the per-sampling-size statistics of one sample.

        Every ``<size>.pickle`` file (a gzip-compressed pickle) listed by
        ``iseqlib.getfiles`` for ``self.indir`` is loaded; the file name
        without its extension is parsed as the integer sampling size.

        Outputs:
          * ``<self.outdir>/<self.name>.txt``: tab-separated table with one
            row per metric in ``self.metrics`` and, for each sampling size, a
            value column followed by a ``Std`` column.
          * ``<self.tempOutdir>/<self.name>.pickle``: the gzip-compressed
            pickle of the ``size -> stats`` dictionary.

        Raises ValueError if a file name (minus extension) is not an integer.
        """
        ext = 'pickle'
        size2stats = {}  # key = sampling size, val = averaged stats
        for filename in iseqlib.getfiles(self.indir, ext):
            # splitext removes exactly one extension; str.rstrip(ext) would
            # strip any trailing run of the characters in ext instead.
            size = int(os.path.splitext(filename)[0])
            # Close the handle explicitly rather than relying on garbage collection.
            with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
                size2stats[size] = pickle.load(handle)
        sizes = sorted(size2stats)

        # Output summary file of the sample:
        metrics = self.metrics
        outfile = os.path.join(self.outdir, "%s.txt" % self.name)
        with open(outfile, 'w') as f:
            f.write("Index")
            for size in sizes:
                f.write("\t%d\tStd" % size)
            for metric in metrics:
                f.write("\n%s" % metric)
                for size in sizes:
                    stats = size2stats[size]
                    # The standard deviation of metric X is stored under "XStd".
                    f.write("\t%f\t%f" % (stats[metric], stats[metric + "Std"]))
            f.write("\n")

        # Pickle size2stats to the temporary output directory
        picklefile = os.path.join(self.tempOutdir, "%s.pickle" % self.name)
        with gzip.open(picklefile, "wb") as handle:
            pickle.dump(size2stats, handle)
Exemplo n.º 2
0
    def run(self):
        """Schedule the per-sample work and the final averaging step.

        For every ``*.pickle`` file listed by ``iseqlib.getfiles`` for
        ``self.indir`` a directory ``<globalTempDir>/<sample>`` is created.

        * If ``self.options.sampling`` is set, ``self.options.numsam``
          ``Sampling`` child targets are added, each with its own numbered
          sub-directory.
        * Otherwise a single ``Analyses`` child target is added with
          sub-directory ``0``. When ``self.options.vs`` or ``self.options.js``
          is given, the sample file is first filtered through
          ``iseqlib.filterSampleByGenes`` and rewritten in place.

        ``AverageResults`` is registered as the follow-on target.
        """
        globalTempDir = self.getGlobalTempDir()
        ext = "pickle"

        for filename in iseqlib.getfiles(self.indir, ext):
            # Drop only the final extension: split('.')[0] would truncate a
            # sample name that itself contains dots and then point at a
            # different (possibly missing) file.
            sample = os.path.splitext(filename)[0]
            samplefile = os.path.join(self.indir, filename)
            sampledir = os.path.join(globalTempDir, sample)
            system("mkdir -p %s" % sampledir)
            if self.options.sampling:
                for i in range(self.options.numsam):  # sampling a number of times
                    samplingdir = os.path.join(sampledir, "%d" % i)
                    system("mkdir -p %s" % samplingdir)
                    self.addChildTarget(Sampling(samplefile, samplingdir, self.options))
            else:
                tempoutdir = os.path.join(sampledir, "0")
                system("mkdir -p %s" % tempoutdir)

                # Filtering if selected Vs and/or selected Js were specified
                if self.options.vs or self.options.js:
                    # Handles are closed explicitly so the rewritten gzip
                    # stream is complete before the child target reads it.
                    with gzip.open(samplefile, "rb") as handle:
                        sampleObj = pickle.load(handle)
                    subsample = iseqlib.filterSampleByGenes(sampleObj, self.options.vs, self.options.js)
                    system("rm %s" % samplefile)
                    with gzip.open(samplefile, "wb") as handle:
                        pickle.dump(subsample, handle)

                self.addChildTarget(Analyses(samplefile, tempoutdir, self.options))
        # Calculate means & standard deviations of samplings
        self.setFollowOnTarget(AverageResults(globalTempDir, self.options))
Exemplo n.º 3
0
 def run(self):
     """Add one ReadFasta child per input fasta file, then chain the analyses.

     The ``fa`` files come from ``iseqlib.getfiles`` on
     ``self.options.indir``; each child receives the file path, the global
     temporary directory and ``self.options.minReadCount``.
     ``SamplingAndAnalyses`` is set as the follow-on target.
     """
     fastas = iseqlib.getfiles(self.options.indir, 'fa')
     tempdir = self.getGlobalTempDir()
     for fasta in fastas:
         child = ReadFasta(
             os.path.join(self.options.indir, fasta),
             tempdir,
             self.options.minReadCount,
         )
         self.addChildTarget(child)

     followon = SamplingAndAnalyses(tempdir, self.options)
     self.setFollowOnTarget(followon)
Exemplo n.º 4
0
 def run(self):
     """Schedule the single-sample ("diversity") analyses.

     Creates ``<self.options.outdir>/diversity``, adds one
     ``SampleSingleAnalyses`` child target per ``*.pickle`` file listed by
     ``iseqlib.getfiles`` for ``self.samdir``, and registers
     ``SummarySingle`` as the follow-on target.
     """
     singleOutdir = os.path.join(self.options.outdir, "diversity")
     system("mkdir -p %s" % singleOutdir)
     globalTempDir = self.getGlobalTempDir()
     ext = 'pickle'
     for sample in iseqlib.getfiles(self.samdir, ext):  # Each sample
         # splitext removes exactly the extension; str.rstrip(ext) strips a
         # set of characters and only worked because of the separating dot.
         samplename = os.path.splitext(sample)[0]
         self.addChildTarget( SampleSingleAnalyses(globalTempDir, samplename, self.samdir, self.options, singleOutdir) )
     # R --no-save --no-restore --args adapt16D-adapt11D.txt < diversityPlot.R
     self.setFollowOnTarget( SummarySingle(globalTempDir, singleOutdir, self.options.diversityIndices) )
Exemplo n.º 5
0
def filterSamples(indir, vs, js):
    """Filter every pickled sample in *indir* in place.

    Each ``*.pickle`` file (gzip-compressed pickle) listed by
    ``iseqlib.getfiles`` is loaded, passed through
    ``iseqlib.filterSampleByGenes(sample, vs, js)`` and written back to the
    same path.

    Args:
        indir: directory holding the sample pickle files.
        vs: selection passed to ``filterSampleByGenes`` (presumably V genes).
        js: selection passed to ``filterSampleByGenes`` (presumably J genes).

    Returns:
        None. Nothing is read or written when both *vs* and *js* are empty.
    """
    if not vs and not js:
        return
    ext = 'pickle'
    for filename in iseqlib.getfiles(indir, ext):
        filepath = os.path.join(indir, filename)
        # Close handles explicitly: an unclosed gzip writer may leave a
        # truncated file if it is not garbage-collected promptly.
        with gzip.open(filepath, "rb") as handle:
            sample = pickle.load(handle)
        subsample = iseqlib.filterSampleByGenes(sample, vs, js)
        system("rm %s" % filepath)
        with gzip.open(filepath, "wb") as handle:
            pickle.dump(subsample, handle)
    return
Exemplo n.º 6
0
    def run(self):
        """Add a Sampling child per sample, then hand over to Analyses.

        Each ``*.pickle`` file listed by ``iseqlib.getfiles`` for
        ``self.indir`` becomes a ``Sampling`` child target that is given the
        input file and ``<globalTempDir>/<sample>.pickle`` as its output.
        ``Analyses`` is set as the follow-on target.
        """
        tempdir = self.getGlobalTempDir()
        extension = "pickle"
        # Sample name = everything before the last dot of the file name.
        names = []
        for filename in iseqlib.getfiles(self.indir, extension):
            names.append(filename.rpartition('.')[0])

        for name in names:
            source = os.path.join(self.indir, "%s.%s" % (name, extension))
            target = os.path.join(tempdir, "%s.pickle" % name)  # temp/sample.pickle
            self.addChildTarget(Sampling(source, target, self.options))

        followon = Analyses(tempdir, self.outdir, self.options)
        self.setFollowOnTarget(followon)
Exemplo n.º 7
0
    def run(self):
        """Write summary tables of the statistics of all samples.

        Loads ``<sample>.pickle`` (gzip-compressed pickle of a
        ``size -> stats`` mapping) for every file listed by
        ``iseqlib.getfiles`` for ``self.indir`` and writes, into
        ``self.outdir``:

          * ``<metric>.txt`` for each metric in ``self.metrics``:
            rows = samples, columns = sampling sizes (value and Std);
          * ``<size>.txt`` for each sampling size:
            rows = samples, columns = metrics (value and Std).

        A sample lacking a given sampling size gets ``NA`` cells. Rows are
        written in sorted sample-name order so the output is deterministic.
        """
        ext = 'pickle'
        sample2size2stats = {}
        sizeset = set()  # set membership instead of repeated list scans
        for filename in iseqlib.getfiles(self.indir, ext):
            # splitext removes exactly the extension; str.rstrip(ext) strips
            # a set of characters, not a suffix.
            name = os.path.splitext(filename)[0]
            with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
                size2stats = pickle.load(handle)
            sizeset.update(size2stats)
            sample2size2stats[name] = size2stats
        sizes = sorted(sizeset)
        samples = sorted(sample2size2stats)
        metrics = self.metrics

        def cells(size2stats, size, metric):
            # One value/Std column pair; the Std of metric X is stored as "XStd".
            if size not in size2stats:
                return "\tNA\tNA"
            stats = size2stats[size]
            return "\t%f\t%f" % (stats[metric], stats[metric + "Std"])

        # Summary of each statistic (1 file/statistic, row = samples, columns = sampling sizes)
        for metric in metrics:
            outfile = os.path.join(self.outdir, "%s.txt" % metric)
            with open(outfile, 'w') as f:
                f.write("Sample")
                for size in sizes:
                    f.write("\t%d\tStd" % size)
                for sample in samples:
                    f.write("\n%s" % sample)
                    for size in sizes:
                        f.write(cells(sample2size2stats[sample], size, metric))
                f.write("\n")

        # Summary of all statistics for each sampling size (1 file/size, row = samples, columns = statistics)
        for size in sizes:
            outfile = os.path.join(self.outdir, "%d.txt" % size)
            with open(outfile, 'w') as f:
                f.write("Sample")
                for metric in metrics:
                    f.write("\t%s\tStd" % metric)
                for sample in samples:
                    f.write("\n%s" % sample)
                    for metric in metrics:
                        f.write(cells(sample2size2stats[sample], size, metric))
                f.write("\n")
Exemplo n.º 8
0
    def run(self):
        """Add one ReadFasta child per input fasta file, then run Analyses.

        The ``fa`` files come from ``iseqlib.getfiles`` on
        ``self.options.indir``. Each child receives the file path, the global
        temporary directory and the sample name (file name without its
        extension). ``Analyses`` is registered as the follow-on target.
        """
        ext = 'fa'
        samples = iseqlib.getfiles(self.options.indir, ext)
        globalTempDir = self.getGlobalTempDir()

        # Read input fasta files and write pickle files into globalTempDir:
        for sample in samples:
            # splitext removes exactly the extension; str.rstrip(ext) would
            # strip any trailing 'f'/'a' characters rather than the suffix.
            name = os.path.splitext(sample)[0]
            infile = os.path.join(self.options.indir, sample)
            self.addChildTarget( ReadFasta(infile, globalTempDir, name) )

        # After done reading fastas, move to the analyses
        self.setFollowOnTarget( Analyses(globalTempDir, self.options) )
Exemplo n.º 9
0
    def run(self):
        """Schedule the pairwise ("similarity") analyses.

        Creates ``<self.options.outdir>/similarity`` and adds one
        ``SamplePairAnalyses`` child target for every unordered pair of
        samples found as ``*.pickle`` files in ``self.samdir`` (each pair is
        scheduled once, in the order the files are listed).
        ``SummaryPair`` is registered as the follow-on target.
        """
        pairOutdir = os.path.join(self.options.outdir, 'similarity')
        system("mkdir -p %s" % pairOutdir)

        globalTempDir = self.getGlobalTempDir()
        ext = 'pickle'
        # splitext removes exactly the extension; str.rstrip(ext) strips a
        # set of characters, not a suffix.
        samplenames = [os.path.splitext(s)[0] for s in iseqlib.getfiles(self.samdir, ext)]
        for i, s1name in enumerate(samplenames):
            # Pair each sample only with the ones after it: no self-pairs, no duplicates.
            for s2name in samplenames[i + 1:]:
                self.addChildTarget( SamplePairAnalyses(globalTempDir, s1name, s2name, self.samdir, self.options, pairOutdir) )

        self.setFollowOnTarget( SummaryPair(globalTempDir, pairOutdir, self.options.similarityIndices) )
Exemplo n.º 10
0
    def run(self):
        """Average the pair statistics of all samplings of one sampling size.

        Loads every ``*.pickle`` file (gzip-compressed pickle) listed by
        ``iseqlib.getfiles`` for ``self.indir``. For each metric in
        ``self.metrics`` it stores the numpy mean under ``<metric>`` and the
        numpy standard deviation (``np.std`` default, i.e. ddof=0) under
        ``<metric>Std`` in a fresh ``PairSamplingStats``. The result is
        pickled to ``<self.outdir>/<self.samplingsize>.pickle``.
        """
        ext = 'pickle'
        statsList = []
        for filename in iseqlib.getfiles(self.indir, ext):
            # Close each handle explicitly instead of relying on garbage collection.
            with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
                statsList.append(pickle.load(handle))

        avrstats = PairSamplingStats()  # initialize avrstats
        # Calculate mean and std using numpy,
        # e.g. metric 'bray' is paired with the key 'brayStd'.
        for metric in self.metrics:
            vals = [stats[metric] for stats in statsList]
            avrstats[metric] = np.mean(vals)
            avrstats[metric + "Std"] = np.std(vals)

        # Pickle the average stat of this sampling size
        picklefile = os.path.join(self.outdir, "%d.pickle" % self.samplingsize)
        with gzip.open(picklefile, "wb") as handle:
            pickle.dump(avrstats, handle)
Exemplo n.º 11
0
    def run(self):
        """Average the single-sample statistics of all samplings of one size.

        Loads every ``*.pickle`` file (gzip-compressed pickle) listed by
        ``iseqlib.getfiles`` for ``self.indir``. For each metric in
        ``self.metrics`` it stores the numpy mean under ``<metric>`` and the
        numpy standard deviation (``np.std`` default, i.e. ddof=0) under
        ``<metric>Std`` in a fresh ``SingleSamplingStats``. The result is
        pickled to ``<self.outdir>/<self.samplingsize>.pickle``.
        """
        ext = 'pickle'
        statsList = []
        for filename in iseqlib.getfiles(self.indir, ext):
            # Close each handle explicitly instead of relying on garbage collection.
            with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
                statsList.append(pickle.load(handle))

        avrstats = SingleSamplingStats()  # initialize avrstats
        # Calculate mean and std using numpy,
        # e.g. metric 'shannon' is paired with the key 'shannonStd'.
        for metric in self.metrics:
            vals = [stats[metric] for stats in statsList]
            avrstats[metric] = np.mean(vals)
            avrstats[metric + "Std"] = np.std(vals)

        # Pickle the average stat of this sampling size
        picklefile = os.path.join(self.outdir, "%d.pickle" % self.samplingsize)
        with gzip.open(picklefile, "wb") as handle:
            pickle.dump(avrstats, handle)
Exemplo n.º 12
0
    def run(self):
        """Write sample-by-sample matrices of the pairwise statistics.

        Every ``*.pickle`` file listed by ``iseqlib.getfiles`` for
        ``self.indir`` is expected to be named ``<sample1>-<sample2>.pickle``
        and to hold a gzip-compressed pickle of a ``size -> stats`` mapping.
        The mapping is registered symmetrically for both samples.

        For each metric in ``self.metrics`` and each sampling size, a file
        ``<self.outdir>/<metric>-<size>.txt`` is written with samples (sorted
        by name) as rows and columns; each cell is a value/Std pair, or
        ``-`` on the diagonal and wherever no data exists for the pair/size.

        NOTE(review): sample names are split on '-', so a sample name that
        itself contains '-' would be mis-parsed; confirm naming upstream.
        """
        ext = 'pickle'
        sample2mate2size2stats = {}
        sizeset = set()  # set membership instead of repeated list scans

        for filename in iseqlib.getfiles(self.indir, ext):
            # splitext removes exactly the extension; str.rstrip(ext) strips
            # a set of characters, not a suffix.
            name = os.path.splitext(filename)[0]
            samples = name.split('-')
            with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
                size2stats = pickle.load(handle)
            sizeset.update(size2stats)

            s1 = samples[0]
            s2 = samples[1]
            # Store the pair under both samples so the matrix is symmetric.
            sample2mate2size2stats.setdefault(s1, {})[s2] = size2stats
            sample2mate2size2stats.setdefault(s2, {})[s1] = size2stats
        sizes = sorted(sizeset)

        # Print summary of each statistic to output files
        # (1 file / 1 statistic, 1 sampling size, where rows = samples, cols = samples)
        metrics = self.metrics
        samples = sorted(sample2mate2size2stats)
        for metric in metrics:
            stdkey = metric + "Std"
            for size in sizes:
                outfile = os.path.join(self.outdir, "%s-%d.txt" % (metric, size))
                with open(outfile, 'w') as f:
                    f.write("Sample")
                    for s in samples:
                        f.write("\t%s\tStd" % (s))
                    for s in samples:
                        f.write("\n%s" % s)
                        mates = sample2mate2size2stats[s]
                        for s2 in samples:
                            # Diagonal cells and unknown pairs have no statistics.
                            size2stats = None if s == s2 else mates.get(s2)
                            if size2stats is None or size not in size2stats:
                                f.write("\t-\t-")
                            else:
                                stat = size2stats[size]
                                f.write("\t%f\t%f" % (stat[metric], stat[stdkey]))
                    f.write("\n")