Пример #1
0
    def test_splitToRowMajor(self):
        """Time the split-to-row-major script on a 100000 x 50 dataset.

        The setup script is evaluated while the single-worker simulation is
        being created; only the follow-up ``compute`` call is timed and
        reported. The simulation is always torn down afterwards.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        rowCount, columnCount = 100000, 50

        setupResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.setupScript(rowCount, columnCount),
            s3Factory,
            1,
            timeout=30,
            memoryLimitMb=8 * 1024,
            threadCount=4,
            returnSimulation=True,
            useInMemoryCache=False
            )

        try:
            self.assertTrue(setupResult.isResult())
            chunks = setupResult.asResult.result

            # Only the split itself is on the clock.
            startTime = time.time()
            splitResult = simulation.compute(
                self.splitToRowMajorScript(),
                timeout=360,
                chunks=chunks
                )
            elapsedSeconds = time.time() - startTime

            self.assertTrue(splitResult.isResult())

            metricName = (
                "algorithms.text.splitToRowMajor.%srows_%scolumns" %
                (rowCount, columnCount)
                )
            PerformanceTestReporter.recordTest(metricName, elapsedSeconds, None)
        finally:
            simulation.teardown()
Пример #2
0
    def stringCreationAndSumTest(self, totalStrings, workers, threadsPerWorker, testName):
        """Time creating `totalStrings` strings and summing their sizes.

        A simulation with `workers` workers (`threadsPerWorker` threads each)
        is booted with a trivial "1+1" computation. Only the string-creation
        expression is timed, and the elapsed time is recorded under
        `testName`. The value returned by the computation is not inspected.
        The simulation is always torn down.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """Vector.range(%s, String).sum(size)""" % totalStrings

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                workers,
                timeout = 240,
                memoryLimitMb = 55 * 1024 / workers,
                threadCount = threadsPerWorker,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            # The clock starts after the simulation is up, so setup cost is
            # excluded from the reported timing.
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #3
0
    def validateTimingsForSubprocessCall(
                self,
                testName,
                subprocessArgs,
                meta,
                timeout = 600.0
                ):
        """Run a subprocess and report the timing it emits.

        The first element of the subprocess output is parsed as a float and,
        when a performance test run is active, recorded under
        "fora_lang." + testName together with `meta`.

        On a nonzero result code a "failure" entry is added to `meta` (the
        caller's dict is mutated), a timing of None is recorded under the
        unprefixed `testName` when a run is active, and the assertion on the
        result code then fails with the runner's third return value as its
        message.
        """
        exitCode, output, errorOutput = SubprocessRunner.callAndReturnResultAndOutput(
            subprocessArgs,
            timeout = timeout
            )

        if exitCode != 0:
            meta.update({"failure": "subprocess call returned error"})

            if PerformanceTestReporter.isCurrentlyTesting():
                PerformanceTestReporter.recordTest(testName, None, meta)

        assert exitCode == 0, errorOutput

        rawTiming = output[0]
        logging.info("Actual time was %s for %s", rawTiming, subprocessArgs)

        # Not reported anywhere; kept so that an unparsable timing or a bad
        # self.baseTiming still raises at this point, as it always has.
        measuredTiming = float(rawTiming) / self.baseTiming

        if not PerformanceTestReporter.isCurrentlyTesting():
            return

        PerformanceTestReporter.recordTest(
            "fora_lang." + testName,
            float(rawTiming),
            meta
            )
Пример #4
0
    def dataCreationTest(self, totalMB, workers, threadsPerWorker, testName):
        """Time the creation of a large vector and record it under `testName`.

        The vector length passed to the script is totalMB * 1024 * 1024 / 8.
        A simulation with `workers` workers (`threadsPerWorker` threads each)
        is booted with a trivial "1+1" computation; only the vector-creation
        expression is timed. The value returned by the computation is not
        inspected. The simulation is always torn down.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """size(Vector.range(%s, {_*_}))""" % (totalMB * 1024 * 1024 / 8)

        _, simulation = self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                workers,
                timeout = 120,
                memoryLimitMb = 55 * 1024 / workers,
                threadCount = threadsPerWorker,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            # Timing begins once the simulation exists; boot time is not part
            # of the reported number.
            t0 = time.time()
            simulation.compute(text, timeout=120)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #5
0
    def stringCreationAndSumTest(self, totalStrings, workers, threadsPerWorker, testName):
        """Measure how long it takes to build `totalStrings` strings and sum their sizes.

        Boots a simulation (`workers` workers, `threadsPerWorker` threads
        each) via a trivial "1+1" computation, then times the string
        expression alone and records the elapsed seconds under `testName`.
        The computation's return value is ignored. Teardown always runs.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """Vector.range(%s, String).sum(size)""" % totalStrings

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                workers,
                timeout = 240,
                memoryLimitMb = 55 * 1024 / workers,
                threadCount = threadsPerWorker,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            # Start timing only now: simulation setup is deliberately excluded.
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #6
0
    def test_transposeToColumnMajor(self):
        """Time the row-major to column-major transpose on a 100000 x 50 dataset.

        The transpose setup script runs while the single-worker simulation is
        created; only the transpose ``compute`` call is timed and reported.
        The simulation is always torn down.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        rowCount, columnCount = 100000, 50

        setupResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.transposeSetupScript(rowCount, columnCount),
            s3Factory,
            1,
            timeout = 300,
            memoryLimitMb = 45 * 1024,
            threadCount = 30,
            returnSimulation = True,
            useInMemoryCache = False
            )

        try:
            self.assertTrue(setupResult.isResult())
            rowMajorData = setupResult.asResult.result

            startTime = time.time()
            transposeResult = simulation.compute(
                self.transposeRowMajorToColumnMajorScript(rowCount, columnCount),
                timeout = 500,
                rowMajor = rowMajorData
                )
            elapsedSeconds = time.time() - startTime

            self.assertTrue(transposeResult.isResult())

            metricName = (
                "algorithms.text.transposeRowMajorToColumnMajor.%srows_%scolumns" %
                (rowCount, columnCount)
                )
            PerformanceTestReporter.recordTest(metricName, elapsedSeconds, None)
        finally:
            simulation.teardown()
Пример #7
0
    def dataCreationTest(self, totalMB, workers, threadsPerWorker, testName):
        """Measure how long creating a large vector takes; record it as `testName`.

        The script builds a vector of totalMB * 1024 * 1024 / 8 elements.
        The simulation (`workers` workers, `threadsPerWorker` threads each)
        is booted with a trivial "1+1" computation and only the
        vector-creation expression is timed. Its return value is ignored.
        Teardown always runs.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """size(Vector.range(%s, {_*_}))""" % (totalMB * 1024 * 1024 /
                                                      8)

        _, simulation = self.computeUsingSeveralWorkers(
            "1+1",
            s3,
            workers,
            timeout=120,
            memoryLimitMb=55 * 1024 / workers,
            threadCount=threadsPerWorker,
            returnSimulation=True,
            useInMemoryCache=False)

        try:
            # Simulation boot time is excluded: the clock starts here.
            t0 = time.time()
            simulation.compute(text, timeout=120)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName,
                                               totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #8
0
    def test_byteToStringAndBackInDifferentPatterns(self):
        """Time `dataAsString` slicing over a flat and a nested vector of ranges.

        The setup script produces a byte vector `ds`, a nested vector of
        (start, stop) pairs `dat`, and `dat.sum()`. Two expressions are then
        timed one after the other and each elapsed time is recorded under its
        own metric name. The computed values are not inspected. The
        simulation is always torn down.
        """
        s3 = ActualS3Interface.ActualS3InterfaceFactory()

        setupScript = """
            let ds = Vector.range(3000000000, {UInt8(_%100)});

            let dat = Vector.range(100, fun(block) {
                Vector.range(1000000, fun(o) { let base = block * 10000000 + o * 10; (base, base + 10) })
                });

            (ds, dat, dat.sum())
            """

        setupResults, simulation = self.computeUsingSeveralWorkers(
            setupScript,
            s3,
            1,
            memoryLimitMb=45 * 1024,
            threadCount=30,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            useInMemoryCache=False,
            timeout=30,
            objectStore=self.createObjectStore(s3)
            )

        # (metric name, expression) pairs, run in this order.
        benchmarks = (
            ("python.BigBox.DataAsString.FlatVector",
             "size(datSum ~~ { ds[_[0],_[1]].dataAsString }) == size(datSum)"),
            ("python.BigBox.DataAsString.NestedVector",
             "size(dat ~~ {_ ~~ { ds[_[0],_[1]].dataAsString } }) == size(dat)"),
            )

        try:
            ds, dat, datSum = setupResults.asResult.result

            for metricName, expression in benchmarks:
                startTime = time.time()
                _ = simulation.compute(
                    expression,
                    timeout=120,
                    ds=ds,
                    dat=dat,
                    datSum=datSum
                    )
                PerformanceTestReporter.recordTest(
                    metricName,
                    time.time() - startTime,
                    None
                    )
        finally:
            simulation.teardown()
Пример #9
0
    def downloadTaxiData(self,
                         filecount,
                         parse=False,
                         workers=1,
                         threadsPerWorker=30,
                         downloaderThreads=8):
        """Download `filecount` taxi CSV files from S3 and report a timing.

        The datasets "taxi_month_1.csv" .. "taxi_month_<filecount>.csv" in the
        test data bucket are concatenated in a single script. When `parse` is
        false, the download time per GB is recorded. When `parse` is true,
        the time taken by a follow-up CSV parse of the downloaded data is
        recorded instead. The simulation is always torn down.
        """
        s3 = ActualS3Interface.ActualS3InterfaceFactory()

        bucketName = self.getTestDataBucket()

        _, simulation = self.computeUsingSeveralWorkers(
            "1+1",
            s3,
            workers,
            memoryLimitMb=45 * 1024 / workers,
            threadCount=threadsPerWorker,
            returnSimulation=True,
            ioTaskThreadOverride=downloaderThreads,
            useInMemoryCache=False,
            objectStore=self.createObjectStore(s3)
            )

        try:
            datasetExpressions = []
            for monthIx in range(1, filecount + 1):
                datasetExpressions.append(
                    'datasets.s3("%s", "taxi_month_%s.csv")' % (bucketName, monthIx)
                    )

            script = (
                """let ds = """ + "+".join(datasetExpressions) + ";" +
                "(ds, ds.sum(), size(ds))"
                )

            downloadTimeStart = time.time()
            downloadResult = simulation.compute(script, timeout=240)
            self.assertTrue(downloadResult.isResult())
            downloadTimeEnd = time.time()

            ds, _, bytecount = downloadResult.asResult.result

            if not parse:
                bytecount = bytecount.pyval
                PerformanceTestReporter.recordTest(
                    "python.BigBox.LargeS3.TaxiSecondsPerGB." + str(filecount),
                    (downloadTimeEnd - downloadTimeStart) / (bytecount / 1024 / 1024.0 / 1024.0),
                    None
                    )
            else:
                parseTimeStart = time.time()
                parseResult = simulation.compute("size(parsing.csv(ds))", timeout=240, ds=ds)
                parseTimeEnd = time.time()

                self.assertTrue(parseResult.isResult())

                PerformanceTestReporter.recordTest(
                    "python.BigBox.LargeS3.ParseTaxidata." + str(filecount),
                    parseTimeEnd - parseTimeStart,
                    None
                    )
        finally:
            simulation.teardown()
Пример #10
0
 def testMemoryUpdate(self):
     """Record a memory-update timing for every (cores, allocSize) pair.

     Walks self.coreList x self.allocSizeList; each allocation size is
     multiplied by 1024 * 1024 before being handed to
     measureMemoryUpdatePerformance, and the returned value is recorded.
     """
     scale = 1024 * 1024
     duration = self.measurementTime
     for coreCount in self.coreList:
         for sizeMB in self.allocSizeList:
             metricName = (
                 "python.BigBox.MemoryUpdate.SecondsPerGB.%sCore_%sMB" %
                 (coreCount, sizeMB)
                 )
             measured = self.measureMemoryUpdatePerformance(
                 duration, scale * sizeMB, coreCount)
             PerformanceTestReporter.recordTest(metricName, measured, None)
Пример #11
0
 def testMmapAllocation(self):
     """Record an mmap-allocation timing for every (cores, allocSize) pair.

     Walks self.coreList x self.allocSizeList; each allocation size is
     multiplied by 1024 * 1024 before being handed to
     measureMmapPerformance (with a trailing False argument), and the
     returned value is recorded.
     """
     scale = 1024 * 1024
     duration = self.measurementTime
     for coreCount in self.coreList:
         for sizeMB in self.allocSizeList:
             metricName = (
                 "python.BigBox.MmapAlloc.SecondsPerGB.%sCore_%sMB" %
                 (coreCount, sizeMB)
                 )
             measured = self.measureMmapPerformance(
                 duration, scale * sizeMB, coreCount, False)
             PerformanceTestReporter.recordTest(metricName, measured, None)
Пример #12
0
    def stringToInt64ParsingTest(self, threads, testName):
        """Time repeated String -> Int64 parsing across `threads` parallel loops.

        The script maps a parsing loop over Vector.range(threads). A single
        one-worker simulation is booted with a trivial "1+1" computation,
        the script is timed on it, and the elapsed time is recorded under
        `testName`. The computation's return value is not inspected.

        The simulation is created exactly once: a second, identical setup
        call used to overwrite the first simulation without tearing it down,
        leaking it.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """
            let doALoop = fun(x) {
                //pass 's' through a vector so that the compiler can't tell what it is
                let s = ["2013"][0];

                let res = 0
                for ix in sequence(x) {
                    if (ix == 0)
                        s = s + String(ix)

                    res = res + Int64(s) + ix
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(20000000 + _)}
            """.replace("__thread_count__", str(threads))

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                1,
                timeout = 240,
                memoryLimitMb = 55 * 1024,
                threadCount = 30,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            # Only the parsing script is timed; simulation setup is excluded.
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName,
                                               totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #13
0
 def testMemoryUpdate(self):
     """Report memory-update performance for each core count and allocation size.

     For every combination drawn from self.coreList and
     self.allocSizeList, measureMemoryUpdatePerformance is called with the
     allocation size multiplied by 1024 * 1024, and its return value is
     recorded under a name containing both parameters.
     """
     nameTemplate = "python.BigBox.MemoryUpdate.SecondsPerGB.%sCore_%sMB"
     measurementTime = self.measurementTime
     for cores in self.coreList:
         for allocSize in self.allocSizeList:
             reportName = nameTemplate % (cores, allocSize)
             allocArgument = 1024 * 1024 * allocSize
             timing = self.measureMemoryUpdatePerformance(
                 measurementTime, allocArgument, cores)
             PerformanceTestReporter.recordTest(reportName, timing, None)
Пример #14
0
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads,
                                 maxBoosts):
        """Record cumulative timings for successive boosting rounds.

        Data is generated while the single-worker simulation is created and
        an initial fitter is computed. Then, for nBoosts in
        range(1, maxBoosts), the predictions step and the next-fitter step
        are run in turn. After each step the time elapsed since the loop
        started (not since the previous step) is recorded, under
        "<name>_predict" and "<name>" respectively. The simulation is always
        torn down.
        """
        # Overwritten on every loop iteration below; the call is retained
        # from the original code.
        testName = self.getTestName(nRows, nColumns, depth, maxBoosts,
                                    nThreads)

        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        dataResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        try:
            self.assertTrue(dataResult.isResult())

            dfPredictors, dfResponse = dataResult.asResult.result

            initialFit = simulation.compute(
                self.regressionScript(depth, 1),
                timeout=360,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )
            fitter = initialFit.asResult.result

            loopStart = time.time()

            for nBoosts in range(1, maxBoosts):
                testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                            nThreads)

                predictionOutcome = simulation.compute(
                    "fitter.predictionsAndPseudoresiduals()",
                    timeout=360,
                    fitter=fitter
                    )
                predictions = predictionOutcome.asResult.result
                elapsedSoFar = time.time() - loopStart

                PerformanceTestReporter.recordTest(
                    testName + "_predict", elapsedSoFar, None)

                nextFitOutcome = simulation.compute(
                    "fitter.nextGivenPredictions(predictions)",
                    timeout=360,
                    fitter=fitter,
                    predictions=predictions
                    )
                fitter = nextFitOutcome.asResult.result
                elapsedSoFar = time.time() - loopStart

                PerformanceTestReporter.recordTest(
                    testName, elapsedSoFar, None)

        finally:
            simulation.teardown()
Пример #15
0
    def stringToInt64ParsingTest(self, threads, testName):
        """Measure String -> Int64 parsing throughput over `threads` parallel loops.

        Builds a script that maps a parsing loop over Vector.range(threads),
        boots one single-worker simulation with a trivial "1+1" computation,
        times the script on it and records the elapsed seconds under
        `testName`. The computation's return value is ignored.

        Only one simulation is created. The earlier code ran the identical
        setup twice and dropped the first simulation without calling
        teardown() on it, which leaked it.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """
            let doALoop = fun(x) {
                //pass 's' through a vector so that the compiler can't tell what it is
                let s = ["2013"][0];

                let res = 0
                for ix in sequence(x) {
                    if (ix == 0)
                        s = s + String(ix)

                    res = res + Int64(s) + ix
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(20000000 + _)}
            """.replace("__thread_count__", str(threads))

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                1,
                timeout = 240,
                memoryLimitMb = 55 * 1024,
                threadCount = 30,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            # The clock covers the parsing script only, not simulation setup.
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #16
0
    def test_cant_report_nonsensical_timing(self):
        """Recording a non-numeric timing must raise.

        The reporter's data-location environment variable is pointed at a
        file inside a fresh temporary directory for the duration of the
        check. The directory is removed afterwards; previously it was left
        behind on disk after every run.
        """
        # Local import: only this test's cleanup needs it.
        import shutil

        tempDir = tempfile.mkdtemp()
        try:
            tempFile = os.path.join(tempDir, "data.json")

            with SetEnv(
                    PerformanceTestReporter.TEST_DATA_LOCATION_ENVIRONMENT_VARIABLE,
                    tempFile
                    ):
                with self.assertRaises(Exception):
                    PerformanceTestReporter.recordTest("test1","not a float",None)
        finally:
            # mkdtemp() leaves deletion to the caller; cleanup is best-effort
            # so it cannot mask the outcome of the test itself.
            shutil.rmtree(tempDir, ignore_errors=True)
Пример #17
0
 def testMmapAllocation(self):
     """Report mmap allocation performance for each core count and allocation size.

     For every combination drawn from self.coreList and
     self.allocSizeList, measureMmapPerformance is called with the
     allocation size multiplied by 1024 * 1024 and a final argument of
     False, and its return value is recorded under a name containing both
     parameters.
     """
     nameTemplate = "python.BigBox.MmapAlloc.SecondsPerGB.%sCore_%sMB"
     measurementTime = self.measurementTime
     for cores in self.coreList:
         for allocSize in self.allocSizeList:
             reportName = nameTemplate % (cores, allocSize)
             allocArgument = 1024 * 1024 * allocSize
             timing = self.measureMmapPerformance(
                 measurementTime, allocArgument, cores, False)
             PerformanceTestReporter.recordTest(reportName, timing, None)
Пример #18
0
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads, maxBoosts):
        """Time boosting rounds one by one, recording cumulative elapsed time.

        After generating data (during simulation creation) and computing an
        initial fitter, each iteration for nBoosts in range(1, maxBoosts)
        runs the predictions step followed by the next-fitter step. Both
        recorded timings are measured from the start of the loop, so they
        accumulate across iterations. The simulation is always torn down.
        """
        # This first name is replaced inside the loop before it is used; the
        # call itself is preserved from the original code.
        testName = self.getTestName(nRows, nColumns, depth, maxBoosts, nThreads)

        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        generated, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3,
            1,
            timeout = 360,
            memoryLimitMb = 30 * 1024,
            threadCount = nThreads,
            returnSimulation = True,
            useInMemoryCache = False
            )
        try:
            self.assertTrue(generated.isResult())

            dfPredictors, dfResponse = generated.asResult.result

            fitter = simulation.compute(
                self.regressionScript(depth, 1),
                timeout = 360,
                dfResponse = dfResponse,
                dfPredictors = dfPredictors
                ).asResult.result

            startedAt = time.time()

            def secondsSinceStart():
                # Elapsed time relative to the start of the boosting loop.
                return time.time() - startedAt

            for nBoosts in range(1, maxBoosts):
                testName = self.getTestName(nRows, nColumns, depth, nBoosts, nThreads)

                predictions = simulation.compute(
                    "fitter.predictionsAndPseudoresiduals()",
                    timeout = 360,
                    fitter = fitter
                    ).asResult.result
                predictElapsed = secondsSinceStart()

                PerformanceTestReporter.recordTest(
                    testName + "_predict",
                    predictElapsed,
                    None
                    )

                fitter = simulation.compute(
                    "fitter.nextGivenPredictions(predictions)",
                    timeout = 360,
                    fitter = fitter,
                    predictions = predictions
                    ).asResult.result
                fitElapsed = secondsSinceStart()

                PerformanceTestReporter.recordTest(
                    testName,
                    fitElapsed,
                    None
                    )

        finally:
            simulation.teardown()
    def test_CalculationRicochet(self):
        """Time two passes of random lookups into a large vector.

        A vector `v` is built while a four-worker simulation is created. The
        same lookup script is then run twice with a count of 1000, first
        with seed 1 and then with seed 2, and each run's elapsed time is
        recorded under its own metric name. The simulation is always torn
        down.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        scriptTemplate = """
            let f = fun(ct, seed = 1) {
                let x = 0

                let res = []

                let it = iterator(math.random.UniformReal(0, size(v), seed))

                for ix in sequence(ct) {
                    let x = Int64(pull it)
                    res = res :: (x / Float64(size(v)), v[x])
                    }

                return res
                }

            v[2]
            f(__count__,__seed__)
            """

        vectorResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "Vector.range(125000000, math.log)",
            s3,
            4,
            timeout=120,
            memoryLimitMb=400,
            threadCount=1,
            useInMemoryCache=True,
            returnSimulation=True
            )

        # (seed, metric name) for each pass, in execution order.
        passes = (
            ("1", "python.InMemoryCumulus.Ricochet1000.Pass1"),
            ("2", "python.InMemoryCumulus.Ricochet1000.Pass2"),
            )

        try:
            v = vectorResult.asResult.result

            for seed, metricName in passes:
                startTime = time.time()
                simulation.compute(
                    scriptTemplate.replace("__seed__", seed).replace("__count__", "1000"),
                    timeout=120,
                    v=v
                    )
                PerformanceTestReporter.recordTest(
                    metricName,
                    time.time() - startTime,
                    None
                    )
        finally:
            simulation.teardown()
Пример #20
0
    def test_effectiveParallelism(self):
        """Report elapsed time and effective core count for a two-worker run.

        The same script is executed twice: the first run is an untimed
        burn-in, the second is timed. Effective parallelism is the sum of
        the interpreter and compiler time from the returned stats, divided
        by the wall-clock time of the second run.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
                let v = Vector.range(5000000, { (1,_) } );

                let f = fun(ix) {
                    let res = 0
                    for x in sequence( (ix - 2000) >>> 0, ix )
                        res = res + size(v[x])
                    res
                    }

                Vector.range(size(v),  f).sum()

                """

        def runAndGetStats():
            # Element [1] of the returned value is what the timed run uses
            # as its stats object.
            return self.computeUsingSeveralWorkers(
                script,
                s3,
                2,
                wantsStats=True,
                timeout=240,
                memoryLimitMb=500
                )[1]

        #do a burn-in run
        runAndGetStats()

        startTime = time.time()
        stats = runAndGetStats()
        timeElapsed = time.time() - startTime

        busyTime = stats.timeSpentInInterpreter + stats.timeSpentInCompiler
        effectiveCores = busyTime / timeElapsed

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.elapsed",
            timeElapsed,
            None
            )

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.effectiveCores",
            effectiveCores,
            {},
            units='count'
            )
Пример #21
0
    def gbmRegressionFittingTest(self,
                                 nRows,
                                 nColumns,
                                 depth,
                                 nThreads,
                                 nBoosts,
                                 copies,
                                 report=True):
        """Time fitting `copies` regression models in a single computation.

        Data is generated while the single-worker simulation is created and
        a builder is computed from the regression script. The timed section
        covers building the test name and applying builder.fit over
        Vector.range(copies). The timing is recorded only when `report` is
        true. The simulation is always torn down.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        dataResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        try:
            self.assertTrue(dataResult.isResult())

            dfPredictors, dfResponse = dataResult.asResult.result

            builderOutcome = simulation.compute(
                self.regressionScript(depth, nBoosts),
                timeout=360,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )
            builder = builderOutcome.asResult.result

            startTime = time.time()

            testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                        nThreads, copies)

            fitScript = (
                "Vector.range(%s).apply(fun(x) { builder.fit(dfPredictors[,-x-1], dfResponse[,-x-1]) })"
                % copies
                )
            fitOutcome = simulation.compute(
                fitScript,
                timeout=360,
                builder=builder,
                dfPredictors=dfPredictors,
                dfResponse=dfResponse
                )
            # The value is unused, but the attribute access is retained
            # (NOTE(review): presumably it fails when the computation did
            # not produce a result; confirm).
            fitted = fitOutcome.asResult.result
            elapsedSeconds = time.time() - startTime

            if report:
                PerformanceTestReporter.recordTest(testName,
                                                   elapsedSeconds,
                                                   None)

        finally:
            simulation.teardown()
    def largeDatasetJoinTest(self, mbOfData, columns, threads, machineCount, ratio = .5):
        """Time an outer join of the two halves of a generated dataset.

        Data is generated while a simulation with `machineCount` workers is
        created; each worker gets a memory limit of
        mbOfData / ratio / machineCount. Only the join script is timed. The
        elapsed time is logged and recorded under a name built from
        mbOfData, columns, threads and machineCount. The simulation is
        always torn down.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, columns),
                        s3,
                        machineCount,
                        timeout = 360,
                        memoryLimitMb = mbOfData / ratio / machineCount,
                        #channelThroughputMBPerSecond = 100.0,
                        threadCount = threads,
                        returnSimulation = True,
                        useInMemoryCache = False,
                        disableEventHandler = True
                        )

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result

            joinScript = """
                    let leftDF = dataframe.DataFrame(data[,size(data)/2])
                    let rightDF = dataframe.DataFrame(data[size(data)/2,])

                    size(leftDF.join(rightDF, on: "C0", how: `outer, chunkSize: 1000000, areSorted:true))
                    """

            # Setup above is not timed; the clock covers the join only.
            t0 = time.time()
            result = simulation.compute(
                joinScript,
                timeout=1080,
                data=data
                )
            totalTimeToReturnResult = time.time() - t0

            logging.info("Total time to join: %s", totalTimeToReturnResult)

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(
                "algorithms.Join.inMemory_%sMB_%scols_%sthreads_%smachines" %
                    (mbOfData, columns,threads,machineCount),
                totalTimeToReturnResult,
                None
                )
        finally:
            # Drop this function's own references to computed values before
            # teardown. The earlier code cleared `dfResponse`/`dfPredictors`,
            # which do not exist here, and left `data` alive.
            # NOTE(review): presumably this lets teardown release the
            # underlying data; confirm.
            data = None
            result = None
            simulation.teardown()
Пример #23
0
    def test_disk_scans(self):
        """Time creating several large vectors and then summing each of them.

        Uses a single-worker simulation with a 1 GB memory limit, an
        S3-backed object store and the in-memory cache disabled. Two timings
        are recorded: one for creating all the vectors and one for creating
        and then scanning them. The simulation is always torn down.
        """
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        objectStore = S3ObjectStore.S3ObjectStore(
            s3,
            Setup.config().userDataS3Bucket,
            prefix="test_object_cache/"
            )

        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            memoryLimitMb=1 * 1024,
            threadCount=30,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            objectStore=objectStore,
            useInMemoryCache=False  #use an actual disk cache for this
            )

        try:
            gigabytes = 8

            writeStart = time.time()

            vectors = [
                simulation.compute(
                    "Vector.range(125000000 + %s)" % ix,
                    timeout=120
                    ).asResult.result
                for ix in range(gigabytes)
                ]

            writeEnd = time.time()

            sums = [
                simulation.compute("v.sum()", timeout = 120, v=vec).asResult.result.pyval
                for vec in vectors
                ]

            self.assertTrue(len(sums) == gigabytes)

            # NOTE(review): this label says 10GB although `gigabytes` is 8.
            # The recorded name is left unchanged here.
            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write.10GB",
                writeEnd - writeStart,
                None
                )

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
                time.time() - writeStart,
                None
                )
        finally:
            simulation.teardown()
Пример #24
0
    def test_effectiveParallelism(self):
        """Measure wall-clock time and effective parallelism on two workers.

        One script is run twice with identical arguments. The first run is
        a burn-in whose outcome is discarded; the second is timed. The
        effective core count is (interpreter time + compiler time) taken
        from the second run's stats, divided by its wall-clock duration.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        workload = """
                let v = Vector.range(5000000, { (1,_) } );

                let f = fun(ix) {
                    let res = 0
                    for x in sequence( (ix - 2000) >>> 0, ix )
                        res = res + size(v[x])
                    res
                    }

                Vector.range(size(v),  f).sum()

                """

        # Keyword arguments shared by the burn-in and the measured run.
        runOptions = dict(wantsStats = True, timeout=240, memoryLimitMb=500)

        #do a burn-in run
        self.computeUsingSeveralWorkers(workload, s3, 2, **runOptions)[1]

        t0 = time.time()
        stats = self.computeUsingSeveralWorkers(workload, s3, 2, **runOptions)[1]
        timeElapsed = time.time() - t0

        totalTime = stats.timeSpentInInterpreter + stats.timeSpentInCompiler
        effParallelism = totalTime / timeElapsed

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.elapsed", timeElapsed, None)

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.effectiveCores",
            effParallelism, {},
            units='count')
    def largeDatasetJoinTest(self,
                             mbOfData,
                             columns,
                             threads,
                             machineCount,
                             ratio=.5):
        """Time an outer dataframe join over generated data and record it.

        The dataset comes from self.dataGenerationScript(mbOfData, columns) and
        is computed on `machineCount` workers with `threads` threads each. Every
        worker gets a memory limit of mbOfData / ratio / machineCount MB. Only
        the join itself is timed; data generation is not.

        The result is reported as
        "algorithms.Join.inMemory_<mb>MB_<cols>cols_<threads>threads_<machines>machines".
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            machineCount,
            timeout=360,
            memoryLimitMb=mbOfData / ratio / machineCount,
            #channelThroughputMBPerSecond = 100.0,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False,
            disableEventHandler=True)

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result

            # The two frames are built from the slices data[,size(data)/2] and
            # data[size(data)/2,] and joined on column "C0".
            joinScript = """
                    let leftDF = dataframe.DataFrame(data[,size(data)/2])
                    let rightDF = dataframe.DataFrame(data[size(data)/2,])

                    size(leftDF.join(rightDF, on: "C0", how: `outer, chunkSize: 1000000, areSorted:true))
                    """

            t0 = time.time()
            result = simulation.compute(joinScript, timeout=1080, data=data)
            totalTimeToReturnResult = time.time() - t0

            logging.info("Total time to join: %s", totalTimeToReturnResult)

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(
                "algorithms.Join.inMemory_%sMB_%scols_%sthreads_%smachines" %
                (mbOfData, columns, threads, machineCount),
                totalTimeToReturnResult, None)
        finally:
            # Clear the locals that actually hold simulation values in this
            # method (`data` and `result`) before tearing the simulation down.
            data = None
            result = None
            simulation.teardown()
Пример #26
0
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, testName):
        """Time a linear regression over generated data on a single worker.

        The data comes from self.dataGenerationScript(mbOfData, columns), whose
        result is unpacked as a (response, predictors) pair. When testName is
        not None, the generation time is recorded as testName + "_create" and
        the regression time as testName.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        creationStart = time.time()

        simulationOptions = dict(
            timeout=360,
            memoryLimitMb=50 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            1,
            **simulationOptions
            )

        if testName is not None:
            PerformanceTestReporter.recordTest(
                testName + "_create",
                time.time() - creationStart,
                None
                )

        try:
            self.assertTrue(result.isResult())

            response, predictors = result.asResult.result

            regressionScript = """
                let model = math.regression.LinearRegression(dfPredictors, dfResponse, fitIntercept: false);
                let coefficients = model.coefficients();
                coefficients[0]
                """

            # The keyword names below are the variable names the script reads.
            regressionStart = time.time()
            result = simulation.compute(
                regressionScript,
                timeout=1080,
                dfResponse=response,
                dfPredictors=predictors
                )
            regressionSeconds = time.time() - regressionStart

            self.assertTrue(result.isResult())

            if testName is not None:
                PerformanceTestReporter.recordTest(testName, regressionSeconds, None)
        finally:
            # Clear local references to simulation values, then tear down.
            response = None
            predictors = None
            result = None
            simulation.teardown()
Пример #27
0
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, testName):
        """Generate a dataset, fit a linear regression to it, and report timings.

        A single-worker simulation evaluates
        self.dataGenerationScript(mbOfData, columns); its result is unpacked
        into (dfResponse, dfPredictors). Timings are only recorded when
        testName is not None: "<testName>_create" for data generation and
        "<testName>" for the regression.
        """
        shouldRecord = testName is not None
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        setupStartedAt = time.time()
        generationScript = self.dataGenerationScript(mbOfData, columns)
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            generationScript,
            s3,
            1,
            timeout=360,
            memoryLimitMb=50 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False)

        if shouldRecord:
            PerformanceTestReporter.recordTest(testName + "_create",
                                               time.time() - setupStartedAt,
                                               None)

        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            regressionScript = """
                let model = math.regression.LinearRegression(dfPredictors, dfResponse, fitIntercept: false);
                let coefficients = model.coefficients();
                coefficients[0]
                """

            fitStartedAt = time.time()
            result = simulation.compute(regressionScript,
                                        timeout=1080,
                                        dfResponse=dfResponse,
                                        dfPredictors=dfPredictors)
            fitSeconds = time.time() - fitStartedAt

            self.assertTrue(result.isResult())

            if shouldRecord:
                PerformanceTestReporter.recordTest(testName, fitSeconds, None)
        finally:
            # Null out the locals that may hold simulation values, then shut
            # the simulation down.
            dfResponse = dfPredictors = result = None
            simulation.teardown()
Пример #28
0
    def roundtripConvert(self, pyObject, testName):
        """Round-trip `pyObject` via self._roundtripConvert and record its timings.

        The second element returned by self._roundtripConvert is treated as a
        mapping from a timing name to a duration. Each entry is printed and
        recorded as "<testName>.<timing name>", in sorted key order.

        Any ordinary exception fails the test with the full traceback in the
        failure message. KeyboardInterrupt and SystemExit are not intercepted.
        """
        try:
            _, timings = self._roundtripConvert(pyObject)

            for k in sorted(timings):
                # Parenthesized single-argument form prints the same text under
                # both the print statement and the print function.
                print("%s %s" % (k, timings[k]))
                PerformanceTestReporter.recordTest(
                    testName=testName + "." + k,
                    elapsedTime=timings[k],
                    metadata=None
                    )

        except Exception:
            import traceback
            traceback.print_exc()
            # Fail with the cause attached instead of a bare "False is not true".
            self.fail(
                "roundtripConvert failed for %s:\n%s" %
                (testName, traceback.format_exc())
                )
Пример #29
0
    def regressionTreePredictionTest(self,
                                     mbOfData,
                                     columns,
                                     testName,
                                     treeDepth,
                                     threads,
                                     minSamplesSplit=50):
        """Record under `testName` how long a fitted regression tree takes to predict.

        Data comes from self.dataGenerationScript(mbOfData, columns) on a
        single worker. The tree is built with
        self.regressionScript(treeDepth, minSamplesSplit - 1). Only the
        prediction call is timed.
        """
        # Script run for the timed prediction; the leading "1" is the value the
        # original helper called its dirtyFlag.
        predictionScript = ";(%s; fitRegressionTree.predict(dfPredictors));" % 1

        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            1,
            timeout=360,
            memoryLimitMb=45 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False
            )

        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            fittingScript = self.regressionScript(treeDepth, minSamplesSplit - 1)
            fitTree = simulation.compute(
                fittingScript,
                timeout=120,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            predictStart = time.time()
            result = simulation.compute(
                predictionScript,
                timeout=120,
                dfPredictors=dfPredictors,
                fitRegressionTree=fitTree.asResult.result
                )
            predictSeconds = time.time() - predictStart

            self.assertTrue(result.isResult())

            PerformanceTestReporter.recordTest(testName, predictSeconds, None)

        finally:
            simulation.teardown()
Пример #30
0
    def diskThroughputTest(self, gb):
        """Time writing, then reading back, gb * 20 pages of 50 MB each.

        Pages go through a CumulusNative.DiskOfflineCache rooted in a fresh
        uuid-named directory under $CUMULUS_DATA_DIR, or under a new temporary
        directory when that variable is unset. Timings are recorded as
        "python.BigBox.Disk.Write<gb>GB" and "python.BigBox.Disk.Read<gb>GB".

        The cache directory is removed afterwards. A temporary parent directory
        created here is removed as well; a user-supplied CUMULUS_DATA_DIR is
        left in place.
        """
        pageBytes = 50 * 1024 * 1024
        pageCount = gb * 20

        baseDir = os.getenv("CUMULUS_DATA_DIR")
        tempRoot = None
        if baseDir is None:
            # We own this directory, so it must be deleted when we are done.
            tempRoot = baseDir = tempfile.mkdtemp()
        dataDir = os.path.join(baseDir, str(uuid.uuid4()))

        def pageIdFor(ix):
            # The same id is used to store and later load page number `ix`.
            return ForaNative.PageId(HashNative.Hash.sha1(str(ix)), pageBytes, pageBytes)

        try:
            # NOTE(review): the two numeric arguments are presumably a byte
            # limit and an item-count limit -- confirm against DiskOfflineCache.
            diskCache = CumulusNative.DiskOfflineCache(
                callbackScheduler,
                dataDir,
                100 * 1024 * 1024 * 1024,
                100000
                )

            fiftyMegabytes = ForaNative.encodeStringInSerializedObject(" " * 1024 * 1024 * 50)

            logging.info("Writing to %s", dataDir)

            t0 = time.time()
            for ix in range(pageCount):
                diskCache.store(pageIdFor(ix), fiftyMegabytes)

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write%sGB" % gb,
                time.time() - t0,
                None
                )

            t0 = time.time()
            for ix in range(pageCount):
                diskCache.loadIfExists(pageIdFor(ix))

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Read%sGB" % gb,
                time.time() - t0,
                None
                )

        finally:
            # Removing the temporary root also removes dataDir beneath it.
            cleanupDir = dataDir if tempRoot is None else tempRoot
            if os.path.isdir(cleanupDir):
                shutil.rmtree(cleanupDir)
Пример #31
0
    def test_reporting_to_file(self):
        # Records two tests while the reporter's data-location environment
        # variable points at a temp file, then checks that loading that file
        # yields both records (including the one whose time is None).
        import shutil

        tempDir = tempfile.mkdtemp()
        try:
            tempFile = os.path.join(tempDir, "data.json")

            with SetEnv(
                    PerformanceTestReporter.TEST_DATA_LOCATION_ENVIRONMENT_VARIABLE,
                    tempFile
                    ):
                PerformanceTestReporter.recordTest("test1.result", 10.0, {"some":"metadata"})
                PerformanceTestReporter.recordTest("test1.result", None, {"some":"metadata"})

            testData = PerformanceTestReporter.loadTestsFromFile(tempFile)

            self.assertEqual(testData,
                [{"name":"test1.result", "time":10.0, "metadata": {"some":"metadata"}},
                 {"name":"test1.result", "time":None, "metadata": {"some":"metadata"}}
                 ])
        finally:
            # mkdtemp leaves deletion to the caller; ignore_errors keeps a
            # cleanup problem from masking an assertion failure above.
            shutil.rmtree(tempDir, ignore_errors=True)
Пример #32
0
    def test_disk_scans(self):
        # Builds `gigabytes` large vectors in a simulation that uses a disk
        # cache and an S3-backed object store, then sums each one. Reports the
        # time to build the vectors and the time to build plus sum them.
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        objectStore = S3ObjectStore.S3ObjectStore(
            s3,
            Setup.config().userDataS3Bucket,
            prefix="test_object_cache/"
            )

        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            memoryLimitMb=1 * 1024,
            threadCount=30,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            objectStore=objectStore,
            useInMemoryCache=False  #use an actual disk cache for this
            )

        try:
            gigabytes = 8

            startedAt = time.time()

            # Each vector has a slightly different length (125000000 + ix).
            resultVectors = [
                simulation.compute(
                    "Vector.range(125000000 + %s)" % ix,
                    timeout=120
                    ).asResult.result
                for ix in range(gigabytes)
                ]

            vectorsBuiltAt = time.time()

            intResults = [
                simulation.compute("v.sum()", timeout=120, v=vec).asResult.result.pyval
                for vec in resultVectors
                ]

            self.assertTrue(len(intResults) == gigabytes)

            # NOTE(review): the key says "10GB" although gigabytes is 8; it is
            # kept unchanged so the recorded metric name stays the same.
            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write.10GB",
                vectorsBuiltAt - startedAt,
                None
                )

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
                time.time() - startedAt,
                None
                )
        finally:
            simulation.teardown()
    def test_CalculationRicochet(self):
        # Builds a large vector on a four-worker simulation, then twice runs a
        # script that reads 1000 randomly chosen elements of it (seeds 1 and
        # 2), recording each pass separately.
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let f = fun(ct, seed = 1) {
                let x = 0

                let res = []

                let it = iterator(math.random.UniformReal(0, size(v), seed))

                for ix in sequence(ct) {
                    let x = Int64(pull it)
                    res = res :: (x / Float64(size(v)), v[x])
                    }

                return res
                }

            v[2]
            f(__count__,__seed__)
            """

        # (seed substituted into the script, name the pass is reported under)
        passes = (
            ("1", "python.InMemoryCumulus.Ricochet1000.Pass1"),
            ("2", "python.InMemoryCumulus.Ricochet1000.Pass2"),
            )

        vResult, sim = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "Vector.range(125000000, math.log)",
            s3,
            4,
            timeout=120,
            memoryLimitMb=400,
            threadCount=1,
            useInMemoryCache=True,
            returnSimulation=True
            )

        try:
            v = vResult.asResult.result

            for seed, reportName in passes:
                passStartedAt = time.time()
                sim.compute(
                    text.replace("__seed__", seed).replace("__count__", "1000"),
                    timeout=120,
                    v=v
                    )
                PerformanceTestReporter.recordTest(
                    reportName,
                    time.time() - passStartedAt,
                    None
                    )
        finally:
            sim.teardown()
Пример #34
0
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads, nBoosts, copies, report=True):
        """Time fitting `copies` models with a builder from self.regressionScript.

        Data comes from self.dataGenerationScript(nRows, nColumns) on a single
        worker with `nThreads` threads; its result is unpacked as
        (dfPredictors, dfResponse). The timed step applies builder.fit over
        Vector.range(copies). The time is recorded under
        self.getTestName(...) unless `report` is false.
        """
        fitAllScript = (
            "Vector.range(%s).apply(fun(x) { builder.fit(dfPredictors[,-x-1], dfResponse[,-x-1]) })"
            % copies
            )

        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(result.isResult())

            dfPredictors, dfResponse = result.asResult.result

            builderComputation = simulation.compute(
                self.regressionScript(depth, nBoosts),
                timeout=360,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors)
            builder = builderComputation.asResult.result

            fitStartedAt = time.time()

            testName = self.getTestName(nRows, nColumns, depth, nBoosts, nThreads, copies)

            # Reading .asResult.result is part of the timed region.
            result = simulation.compute(
                fitAllScript,
                timeout=360,
                builder=builder,
                dfPredictors=dfPredictors,
                dfResponse=dfResponse).asResult.result
            fitSeconds = time.time() - fitStartedAt

            if report:
                PerformanceTestReporter.recordTest(testName, fitSeconds, None)

        finally:
            simulation.teardown()
Пример #35
0
    def regressionTreePredictionTest(self, mbOfData, columns, testName,
                                     treeDepth, threads, minSamplesSplit=50):
        """Fit a regression tree on generated data and time a prediction pass.

        The fit uses self.regressionScript(treeDepth, minSamplesSplit - 1) and
        is not timed. The following predict call over the same predictors is
        timed and recorded under `testName`.
        """
        predictionTemplate = ";(%s; fitRegressionTree.predict(dfPredictors));"
        dirtyFlag = 1

        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        generationScript = self.dataGenerationScript(mbOfData, columns)
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            generationScript,
            s3,
            1,
            timeout=360,
            memoryLimitMb=45 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            fitTree = simulation.compute(
                self.regressionScript(treeDepth, minSamplesSplit - 1),
                timeout=120,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors)

            startedAt = time.time()
            result = simulation.compute(
                predictionTemplate % dirtyFlag,
                timeout=120,
                dfPredictors=dfPredictors,
                fitRegressionTree=fitTree.asResult.result)
            elapsedSeconds = time.time() - startedAt

            self.assertTrue(result.isResult())

            PerformanceTestReporter.recordTest(testName, elapsedSeconds, None)

        finally:
            simulation.teardown()
Пример #36
0
    def stringToDatetimeParsingTest(self, threads, testName):
        """Record under `testName` the time to parse datetime strings in loops.

        The script applies doALoop to each element of Vector.range(threads).
        Each loop constructs DateTime(s) from the same string a little over one
        million times. Only the compute call is timed; starting the simulation
        is not.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """
            let s = ["2013-01-01 15:18:10"][0];

            let doALoop = fun(x) {
                let res = 0
                for ix in sequence(x) {
                    res = res + DateTime(s).year
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(1000000 + _)}
            """.replace("__thread_count__", str(threads))

        # This method previously divided the memory limit by an undefined name
        # `workers`; it always starts exactly one worker.
        workers = 1

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                workers,
                timeout = 240,
                memoryLimitMb = 55 * 1024 / workers,
                threadCount = 30,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName,
                                               totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #37
0
    def diskThroughputTest(self, gb):
        """Measure disk cache write and read time for gb * 20 pages of 50 MB.

        A CumulusNative.DiskOfflineCache is created in a new uuid-named
        directory below $CUMULUS_DATA_DIR, or below a fresh temporary directory
        if the variable is not set. The write pass is recorded as
        "python.BigBox.Disk.Write<gb>GB" and the read pass as
        "python.BigBox.Disk.Read<gb>GB".

        On exit the cache directory is deleted. If this method created the
        temporary parent directory, that is deleted too.
        """
        fiftyMb = 50 * 1024 * 1024

        configuredDir = os.getenv("CUMULUS_DATA_DIR")
        createdTempDir = None
        if configuredDir is None:
            createdTempDir = tempfile.mkdtemp()
            parentDir = createdTempDir
        else:
            parentDir = configuredDir
        dataDir = os.path.join(parentDir, str(uuid.uuid4()))

        def makePageId(ix):
            # Page ids are derived from the page index, so the read pass asks
            # for exactly the pages the write pass stored.
            return ForaNative.PageId(HashNative.Hash.sha1(str(ix)),
                                     fiftyMb, fiftyMb)

        try:
            # NOTE(review): the trailing numbers look like a byte limit and an
            # item-count limit -- confirm against DiskOfflineCache.
            diskCache = CumulusNative.DiskOfflineCache(callbackScheduler, dataDir,
                                                       100 * 1024 * 1024 * 1024,
                                                       100000)

            fiftyMegabytes = ForaNative.encodeStringInSerializedObject(" " * 1024 *
                                                                       1024 * 50)

            logging.info("Writing to %s", dataDir)

            t0 = time.time()
            for ix in range(gb * 20):
                diskCache.store(makePageId(ix), fiftyMegabytes)

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write%sGB" % gb,
                time.time() - t0, None)

            t0 = time.time()
            for ix in range(gb * 20):
                diskCache.loadIfExists(makePageId(ix))

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Read%sGB" % gb,
                time.time() - t0, None)

        finally:
            # Deleting the temp directory we created also deletes dataDir;
            # a configured CUMULUS_DATA_DIR itself is never deleted.
            toRemove = createdTempDir if createdTempDir is not None else dataDir
            if os.path.isdir(toRemove):
                shutil.rmtree(toRemove)
Пример #38
0
    def stringToDatetimeParsingTest(self, threads, testName):
        """Time repeated DateTime string parsing and record it as `testName`.

        `threads` sets the length of the Vector.range the script applies
        doALoop to; each loop builds DateTime(s) from one fixed string a little
        over one million times. Simulation start-up is excluded from the
        recorded time.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """
            let s = ["2013-01-01 15:18:10"][0];

            let doALoop = fun(x) {
                let res = 0
                for ix in sequence(x) {
                    res = res + DateTime(s).year
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(1000000 + _)}
            """.replace("__thread_count__", str(threads))

        # `workers` was referenced without being defined anywhere in this
        # method; a single worker is what the call has always requested.
        workers = 1

        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                workers,
                timeout = 240,
                memoryLimitMb = 55 * 1024 / workers,
                threadCount = 30,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            t0 = time.time()
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)
        finally:
            simulation.teardown()
Пример #39
0
    def loopScalabilityTestTest(self, threads, testName):
        """Record under `testName` the time to run a long counting loop per element.

        The script applies doALoop (roughly one billion additions) to each
        element of Vector.range(threads), on one worker started with 30
        threads.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        scriptTemplate = """
            let doALoop = fun(x) {
                let res = 0
                for ix in sequence(x) {
                    res = res + ix + 1
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(1000000000 + _)}
            """
        text = scriptTemplate.replace("__thread_count__", str(threads))

        # NOTE: this timestamp is overwritten inside the try block before it is
        # read, so simulation start-up is not part of the reported time.
        startedAt = time.time()

        simulationOptions = dict(
            timeout=240,
            memoryLimitMb=55 * 1024,
            threadCount=30,
            returnSimulation=True,
            useInMemoryCache=False
            )
        _, simulation = self.computeUsingSeveralWorkers("1+1", s3, 1, **simulationOptions)

        try:
            startedAt = time.time()
            result = simulation.compute(text, timeout=240)
            elapsedSeconds = time.time() - startedAt

            PerformanceTestReporter.recordTest(testName, elapsedSeconds, None)
        finally:
            simulation.teardown()
Пример #40
0
    def loopScalabilityTestTest(self, threads, testName):
        """Time a script of long arithmetic loops and report it as `testName`.

        `threads` is substituted into the script as the length of the
        Vector.range that doALoop is applied to. The simulation itself always
        uses one worker with threadCount=30.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        threadCountText = str(threads)
        text = """
            let doALoop = fun(x) {
                let res = 0
                for ix in sequence(x) {
                    res = res + ix + 1
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(1000000000 + _)}
            """.replace("__thread_count__", threadCountText)

        # Taken before start-up but reset below, so only the compute call is
        # reflected in the recorded duration.
        timerStart = time.time()

        workerResults = self.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            timeout=240,
            memoryLimitMb=55 * 1024,
            threadCount=30,
            returnSimulation=True,
            useInMemoryCache=False
            )
        _, simulation = workerResults

        try:
            timerStart = time.time()
            result = simulation.compute(text, timeout=240)
            totalSeconds = time.time() - timerStart

            PerformanceTestReporter.recordTest(testName, totalSeconds, None)
        finally:
            simulation.teardown()
Пример #41
0
    def validateTimingsForSubprocessCall(self,
                                         testName,
                                         subprocessArgs,
                                         meta,
                                         timeout=600.0):
        """Run a subprocess and record the timing it prints.

        out[0] from the subprocess is parsed as a float and, when the reporter
        is currently testing, recorded under "fora_lang." + testName with
        `meta` as metadata.

        On a non-zero exit code, `meta` is updated in place with a "failure"
        entry, a record with time None is written under the unprefixed
        testName (again only when the reporter is currently testing), and an
        AssertionError carrying the subprocess's error output is raised.
        """
        resultCode, out, err = SubprocessRunner.callAndReturnResultAndOutput(
            subprocessArgs,
            timeout=timeout
            )
        succeeded = resultCode == 0

        if not succeeded:
            meta.update({"failure": "subprocess call returned error"})

            if PerformanceTestReporter.isCurrentlyTesting():
                PerformanceTestReporter.recordTest(testName, None, meta)

        assert succeeded, err

        reportedTime = out[0]
        logging.info("Actual time was %s for %s", reportedTime, subprocessArgs)

        seconds = float(reportedTime)

        # Not used further in this method; the division is retained so the
        # same attribute access and arithmetic still take place.
        measuredTiming = seconds / self.baseTiming

        if PerformanceTestReporter.isCurrentlyTesting():
            PerformanceTestReporter.recordTest("fora_lang." + testName, seconds, meta)
Пример #42
0
    def test_disk_read_and_write_perf(self):
        # Runs one reader thread and one writer thread against the same
        # DiskOfflineCache for TOTAL_SECONDS, then reports the time each would
        # need per 1024 MB at the throughput it achieved.
        #
        # The cache lives in a fresh uuid-named directory below
        # $CUMULUS_DATA_DIR, or below a new temporary directory when that is
        # unset. A temporary directory created here is removed afterwards.
        baseDir = os.getenv("CUMULUS_DATA_DIR")
        tempRoot = None
        if baseDir is None:
            tempRoot = baseDir = tempfile.mkdtemp()
        dataDir = os.path.join(baseDir, str(uuid.uuid4()))

        try:
            diskCache = CumulusNative.DiskOfflineCache(
                callbackScheduler,
                dataDir,
                100 * 1024 * 1024 * 1024,
                100000
                )

            fiftyMegabytes = ForaNative.encodeStringInSerializedObject(" " * 1024 * 1024 * 50)

            logging.info("Writing to %s", dataDir)

            storedPageID = ForaNative.PageId(HashNative.Hash.sha1("pageId"), 50 * 1024 * 1024, 50 * 1024 * 1024)

            # Stored up front so the reader always has a page to request.
            diskCache.store(storedPageID, fiftyMegabytes)

            t0 = time.time()

            TOTAL_SECONDS = 20.0

            # Single-element lists let the thread functions update the totals.
            # Both totals are counted in megabytes (50 per page).
            totalReadMb = [0]
            totalWriteMb = [0]

            def readerThread():
                while time.time() - t0 < TOTAL_SECONDS:
                    diskCache.loadIfExists(storedPageID)
                    totalReadMb[0] += 50

            def writerThread():
                ix = 0
                while time.time() - t0 < TOTAL_SECONDS:
                    ix += 1
                    diskCache.store(
                        ForaNative.PageId(HashNative.Hash.sha1(str(ix)), 50 * 1024 * 1024, 50 * 1024 * 1024),
                        fiftyMegabytes
                        )
                    totalWriteMb[0] += 50

            threads = [
                threading.Thread(target = readerThread),
                threading.Thread(target = writerThread)
                ]

            for t in threads:
                t.start()
            for t in threads:
                t.join()

            # Use one interval for both rates so that time spent recording the
            # first metric does not leak into the second.
            elapsed = time.time() - t0

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.ReadAndWrite.Write1GB",
                1024 / (totalWriteMb[0] / elapsed),
                None
                )

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.ReadAndWrite.Read1GB",
                1024 / (totalReadMb[0] / elapsed),
                None
                )

        finally:
            # Removing the temporary root also removes dataDir beneath it.
            cleanupDir = dataDir if tempRoot is None else tempRoot
            if os.path.isdir(cleanupDir):
                shutil.rmtree(cleanupDir)
Пример #43
0
    def dataframeSumTest(self, mbOfData, colCount, threadCount, recordResults = True):
        """Time summing randomly chosen columns over many rows of generated data.

        The data comes from self.dataGenerationScript(mbOfData, colCount) on a
        single worker with `threadCount` threads. The timed script sums
        1000000 * threadCount rows, picking 10 columns per row with a seeded
        generator.

        When `recordResults` is true, the number of seconds needed per thread
        for 10 million value accesses (as computed below) is recorded under
        "python.BigBox.RandomColumnAccess.access10mm_<mb>mb_<cols>cols_<threads>threads".
        The metric is only recorded after the computation is confirmed to have
        produced a result.

        Returns the wall-clock duration of the timed computation in seconds.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        randomColumnsToPick = 10
        totalRowsToSum = 1000000

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, colCount),
                        s3,
                        count=1,
                        timeout = 360,
                        memoryLimitMb = 10000,
                        threadCount = threadCount,
                        returnSimulation = True,
                        useInMemoryCache = False,
                        channelThroughputMBPerSecond = None
                        )

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result
            executionScript = ("""
                let randomRowwiseSumFun = fun (row, randomColumnsToPick, baseSeed){
                    let rng = iterator(math.random.MultiplyWithCarry(baseSeed + row.rowIndex()));
                    let tr = nothing;
                    let ix = 0;
                    let rowSize = size(row)
                    while (ix < randomColumnsToPick) {
                        let nextIx = (pull rng) % rowSize;
                        tr = tr + row[nextIx]
                        ix = ix + 1
                        }
                    tr
                }
                let randomColumnsToPick = __subsetSize__;
                let baseSeed = 5;
                sum(0, __rows_to_sum__, fun(ix) { randomRowwiseSumFun(data[ix % size(data)], randomColumnsToPick, baseSeed) })
                """
                .replace("__subsetSize__",str(randomColumnsToPick))
                .replace("__rows_to_sum__",str(totalRowsToSum * threadCount))
                )


            t0 = time.time()
            result = simulation.compute(
                executionScript,
                timeout=1080,
                data=data
                )
            computeDuration = time.time() - t0

            # Validate before reporting so that a failed computation never
            # contributes a timing.
            self.assertTrue(result.isResult())

            totalValuesAccessed = totalRowsToSum * randomColumnsToPick * threadCount

            # NOTE(review): the factor of 2 is kept from the original formula;
            # its rationale is not visible here.
            totalValuesPerSecondPerThread = totalValuesAccessed * 2 / computeDuration / threadCount

            secondsToDo10MillionPerThread = 10 * 1000000 / totalValuesPerSecondPerThread

            if recordResults:
                PerformanceTestReporter.recordTest(
                    "python.BigBox.RandomColumnAccess.access10mm_%smb_%scols_%sthreads" % (
                        mbOfData,
                        colCount,
                        threadCount
                        ),
                    secondsToDo10MillionPerThread,
                    None
                    )

            return computeDuration

        finally:
            # Clear the locals that actually hold simulation values in this
            # method (`data` and `result`) before tearing the simulation down.
            data = None
            result = None
            simulation.teardown()
    def largeDatasetBigLMTest(self,
                              mbOfData,
                              columns,
                              threads,
                              machineCount,
                              ratio=.4):
        """Time two consecutive linear regressions over generated data.

        The data comes from self.dataGenerationScript(mbOfData, columns) and is
        computed on `machineCount` workers with `threads` threads each. Every
        worker gets a memory limit of mbOfData / ratio / machineCount MB. The
        generated result is unpacked as (dfResponse, dfPredictors).

        The first regression uses the predictors as they are; the second adds
        two derived columns first. The recorded time runs from just before the
        first regression to just after the second and is reported as
        "algorithms.linearRegression.inMemory_<mb>MB_<cols>cols_<threads>threads_<machines>machines".
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            machineCount,
            timeout=360,
            memoryLimitMb=mbOfData / ratio / machineCount,
            channelThroughputMBPerSecond=100.0,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False)

        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            regressionScript = """
                    let model = math.regression.LinearRegression(dfPredictors, dfResponse,coefficientsOnly:true, splitLimit: 50000);
                    let coefficients = model.coefficients();
                    coefficients[0]
                    """

            t0 = time.time()
            result = simulation.compute(regressionScript,
                                        timeout=1080,
                                        dfResponse=dfResponse,
                                        dfPredictors=dfPredictors)

            self.assertTrue(result.isResult(), result)

            # Parenthesized single-argument form prints the same text under
            # both the print statement and the print function.
            print("Done with the first regression")

            regressionScript2 = """
                    let newCol = dfPredictors.rowApply(fun(row) { math.sin(row[0] ) })
                    let newCol2 = dfPredictors.rowApply(fun(row) { math.sin(row[0] + 1) })
                    let model2 = math.regression.LinearRegression(dfPredictors.addColumn(newCol).addColumn(newCol2), dfResponse, coefficientsOnly:true, splitLimit: 50000)
                    model2.coefficients()[0]
                    """

            result2 = simulation.compute(regressionScript2,
                                         timeout=1080,
                                         dfResponse=dfResponse,
                                         dfPredictors=dfPredictors)

            # t0 is not reset between the regressions, so this covers both.
            totalTimeToReturnResult = time.time() - t0

            self.assertTrue(result2.isResult(), result2)

            PerformanceTestReporter.recordTest(
                "algorithms.linearRegression.inMemory_%sMB_%scols_%sthreads_%smachines"
                % (mbOfData, columns, threads, machineCount),
                totalTimeToReturnResult, None)
        finally:
            # Clear local references to simulation values, then tear down.
            dfResponse = None
            dfPredictors = None
            result = None
            result2 = None
            simulation.teardown()
Пример #45
0
    def dataframeSumTest(self,
                         mbOfData,
                         colCount,
                         threadCount,
                         recordResults=True):
        """Time a sum over pseudo-randomly chosen columns of generated rows.

        Generates data with ``self.dataGenerationScript(mbOfData, colCount)``
        on a single simulated worker, then runs a script that, for
        ``1000000 * threadCount`` row visits, adds up 10 columns picked by a
        per-row seeded random generator.

        Args:
            mbOfData: forwarded to dataGenerationScript.
            colCount: forwarded to dataGenerationScript.
            threadCount: threadCount for the worker; also scales the number
                of rows summed.
            recordResults: when true, report the derived
                "seconds per 10 million values per thread" figure through
                PerformanceTestReporter.

        Returns:
            Wall-clock seconds spent in the summation compute call.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        randomColumnsToPick = 10
        totalRowsToSum = 1000000

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, colCount),
            s3,
            count=1,
            timeout=360,
            memoryLimitMb=10000,
            threadCount=threadCount,
            returnSimulation=True,
            useInMemoryCache=False,
            channelThroughputMBPerSecond=None)

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result
            executionScript = ("""
                let randomRowwiseSumFun = fun (row, randomColumnsToPick, baseSeed){
                    let rng = iterator(math.random.MultiplyWithCarry(baseSeed + row.rowIndex()));
                    let tr = nothing;
                    let ix = 0;
                    let rowSize = size(row)
                    while (ix < randomColumnsToPick) {
                        let nextIx = (pull rng) % rowSize;
                        tr = tr + row[nextIx]
                        ix = ix + 1
                        }
                    tr
                }
                let randomColumnsToPick = __subsetSize__;
                let baseSeed = 5;
                sum(0, __rows_to_sum__, fun(ix) { randomRowwiseSumFun(data[ix % size(data)], randomColumnsToPick, baseSeed) })
                """.replace("__subsetSize__",
                            str(randomColumnsToPick)).replace(
                                "__rows_to_sum__",
                                str(totalRowsToSum * threadCount)))

            t0 = time.time()
            result = simulation.compute(executionScript,
                                        timeout=1080,
                                        data=data)
            computeDuration = time.time() - t0

            # Validate before reporting so that a failed computation never
            # publishes a timing figure.
            self.assertTrue(result.isResult())

            if recordResults:
                totalValuesAccessed = totalRowsToSum * randomColumnsToPick * threadCount

                # NOTE(review): the factor of 2 is carried over unchanged; its
                # rationale is not visible in this method.
                totalValuesPerSecondPerThread = totalValuesAccessed * 2 / computeDuration / threadCount

                secondsToDo10MillionPerThread = 10 * 1000000 / totalValuesPerSecondPerThread

                PerformanceTestReporter.recordTest(
                    "python.BigBox.RandomColumnAccess.access10mm_%smb_%scols_%sthreads"
                    % (mbOfData, colCount, threadCount),
                    secondsToDo10MillionPerThread, None)

            return computeDuration

        finally:
            # Drop local references to values obtained from the simulation
            # before tearing it down (presumably so teardown can release them).
            data = None
            result = None
            simulation.teardown()
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, machineCount, ratio = .5):
        """Time two linear regressions over a freshly generated dataset.

        The dataset is produced by ``self.dataGenerationScript(mbOfData,
        columns)`` on an in-memory simulation of ``machineCount`` workers.
        A LinearRegression is then fit on the raw predictors, followed by a
        second fit on the predictors extended with two sine-derived columns.
        The wall-clock time from just before the first regression is
        submitted until the second one returns is reported through
        PerformanceTestReporter.

        Args:
            mbOfData: forwarded to dataGenerationScript; also sizes the
                per-worker memory limit.
            columns: forwarded to dataGenerationScript.
            threads: threadCount given to each worker.
            machineCount: number of simulated workers.
            ratio: divisor applied to mbOfData when sizing memory; each
                worker gets ``mbOfData / ratio / machineCount`` MB.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, columns),
                        s3,
                        machineCount,
                        timeout = 360,
                        memoryLimitMb = mbOfData / ratio / machineCount,
                        channelThroughputMBPerSecond = 100.0,
                        threadCount = threads,
                        returnSimulation = True,
                        useInMemoryCache = False
                        )

        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            regressionScript = """
                    let model = math.regression.LinearRegression(dfPredictors, dfResponse,coefficientsOnly:true);
                    let coefficients = model.coefficients();
                    coefficients[0]
                    """

            # The clock starts here and is read only once, after the second
            # regression, so the reported figure covers both fits.
            t0 = time.time()
            result = simulation.compute(
                regressionScript,
                timeout=1080,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            self.assertTrue(result.isResult(), result)

            # Single-argument parenthesised form prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("Done with the first regression")

            regressionScript2 = """
                    let newCol = dfPredictors.rowApply(fun(row) { math.sin(row[0] ) })
                    let newCol2 = dfPredictors.rowApply(fun(row) { math.sin(row[0] + 1) })
                    let model2 = math.regression.LinearRegression(dfPredictors.addColumn(newCol).addColumn(newCol2), dfResponse, coefficientsOnly:true)
                    model2.coefficients()[0]
                    """

            result2 = simulation.compute(
                regressionScript2,
                timeout=1080,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            totalTimeToReturnResult = time.time() - t0

            self.assertTrue(result2.isResult(), result2)

            PerformanceTestReporter.recordTest(
                "algorithms.linearRegression.inMemory_%sMB_%scols_%sthreads_%smachines" %
                    (mbOfData, columns,threads,machineCount),
                totalTimeToReturnResult,
                None
                )
        finally:
            # Drop every local reference to values obtained from the
            # simulation before tearing it down (presumably so teardown can
            # release them -- the original did this for all but result2).
            dfResponse = None
            dfPredictors = None
            result = None
            result2 = None
            simulation.teardown()
Пример #47
0
    def test_disk_read_and_write_perf(self):
        """Measure concurrent read and write throughput of a DiskOfflineCache.

        One thread repeatedly loads a single stored 50 MB page while another
        keeps storing new 50 MB pages, both for TOTAL_SECONDS. The time each
        side would need to move 1 GB at the observed rate is reported through
        PerformanceTestReporter.

        The cache lives in a uniquely named subdirectory of CUMULUS_DATA_DIR
        when that variable is set, otherwise of a fresh temporary directory.
        The subdirectory is always removed afterwards; the temporary parent
        is removed too when this test created it.
        """
        PAGE_MB = 50
        PAGE_BYTES = PAGE_MB * 1024 * 1024

        configuredDir = os.getenv("CUMULUS_DATA_DIR")
        if configuredDir is None:
            # mkdtemp leaves deletion to the caller, so remember the root.
            tempRoot = tempfile.mkdtemp()
            dataDir = os.path.join(tempRoot, str(uuid.uuid4()))
        else:
            tempRoot = None
            dataDir = os.path.join(configuredDir, str(uuid.uuid4()))

        try:
            # NOTE(review): the two numeric arguments look like a byte limit
            # (100 GB) and an item limit -- confirm against DiskOfflineCache.
            diskCache = CumulusNative.DiskOfflineCache(callbackScheduler, dataDir,
                                                       100 * 1024 * 1024 * 1024,
                                                       100000)

            try:
                fiftyMegabytes = ForaNative.encodeStringInSerializedObject(
                    " " * PAGE_BYTES)

                logging.info("Writing to %s", dataDir)

                storedPageID = ForaNative.PageId(HashNative.Hash.sha1("pageId"),
                                                 PAGE_BYTES,
                                                 PAGE_BYTES)

                diskCache.store(storedPageID, fiftyMegabytes)

                t0 = time.time()

                TOTAL_SECONDS = 20.0

                # One-element lists so the worker closures can update the
                # totals (Python 2 has no ``nonlocal``). Units are megabytes.
                megabytesRead = [0]
                megabytesWritten = [0]

                def readerThread():
                    while time.time() - t0 < TOTAL_SECONDS:
                        diskCache.loadIfExists(storedPageID)
                        megabytesRead[0] += PAGE_MB

                def writerThread():
                    ix = 0
                    while time.time() - t0 < TOTAL_SECONDS:
                        ix += 1
                        diskCache.store(
                            ForaNative.PageId(HashNative.Hash.sha1(str(ix)),
                                              PAGE_BYTES, PAGE_BYTES),
                            fiftyMegabytes)
                        megabytesWritten[0] += PAGE_MB

                threads = [
                    threading.Thread(target=readerThread),
                    threading.Thread(target=writerThread)
                ]

                for t in threads:
                    t.start()
                for t in threads:
                    t.join()

                # Sample the clock once so both metrics share the same
                # interval and neither includes time spent reporting the other.
                elapsedSeconds = time.time() - t0

                PerformanceTestReporter.recordTest(
                    "python.BigBox.Disk.ReadAndWrite.Write1GB",
                    1024 / (megabytesWritten[0] / elapsedSeconds), None)

                PerformanceTestReporter.recordTest(
                    "python.BigBox.Disk.ReadAndWrite.Read1GB",
                    1024 / (megabytesRead[0] / elapsedSeconds), None)

            finally:
                shutil.rmtree(dataDir)
        finally:
            # Runs even if removing dataDir failed, so the temporary parent
            # created above is not left behind.
            if tempRoot is not None:
                shutil.rmtree(tempRoot)
Пример #48
0
 def throws_if_not_reporting(self):
     # Expect recordTest to raise. Judging by the method name this is meant to
     # run while test reporting is disabled -- TODO confirm against the caller.
     with self.assertRaises(Exception):
         PerformanceTestReporter.recordTest("test1",10.0,None)