Example #1
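Builds a ~3 GB byte vector together with flat and nested vectors of slice bounds, then times dataAsString extraction over both shapes and records the results with PerformanceTestReporter.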
    def test_byteToStringAndBackInDifferentPatterns(self):
        s3 = ActualS3Interface.ActualS3InterfaceFactory()

        # Fora setup: a 3-billion-element vector of bytes 0..99, plus 100 blocks
        # of one million (base, base + 10) slice bounds into it.
        setupText = (
            """
            let ds = Vector.range(3000000000, {UInt8(_%100)});

            let dat = Vector.range(100, fun(block) {
                Vector.range(1000000, fun(o) { let base = block * 10000000 + o * 10; (base, base + 10) })
                });

            (ds, dat, dat.sum())
            """
            )

        setupResults, simulation = self.computeUsingSeveralWorkers(
                setupText,
                s3,
                1,
                memoryLimitMb=45 * 1024,
                threadCount=30,
                returnSimulation=True,
                ioTaskThreadOverride=8,
                useInMemoryCache=False,
                timeout=30,
                objectStore=self.createObjectStore(s3)
                )

        try:
            ds, dat, datSum = setupResults.asResult.result

            t0 = time.time()
            result = simulation.compute(
                "size(datSum ~~ { ds[_[0],_[1]].dataAsString }) == size(datSum)",
                timeout=120,
                ds=ds,
                dat=dat,
                datSum=datSum
                )
            PerformanceTestReporter.recordTest(
                "python.BigBox.DataAsString.FlatVector",
                time.time() - t0,
                None
                )

            t0 = time.time()
            result = simulation.compute(
                "size(dat ~~ {_ ~~ { ds[_[0],_[1]].dataAsString } }) == size(dat)",
                timeout=120,
                ds=ds,
                dat=dat,
                datSum=datSum
                )
            PerformanceTestReporter.recordTest(
                "python.BigBox.DataAsString.NestedVector",
                time.time() - t0,
                None
                )
        finally:
            simulation.teardown()
Example #2
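Wires up a CumulusService worker: callback schedulers, a channel listener on two ports, a SharedState view factory, and the real S3 interface factory paired with a null object store.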
def createService(args):
    callbackSchedulerFactory = CallbackScheduler.createSimpleCallbackSchedulerFactory()
    callbackScheduler = callbackSchedulerFactory.createScheduler('ufora-worker', 1)
    channelListener = MultiChannelListener(callbackScheduler,
                                           [args.base_port, args.base_port + 1])

    # SharedState gets its own scheduler so its traffic is isolated from the worker's.
    sharedStateViewFactory = ViewFactory.ViewFactory.TcpViewFactory(
        callbackSchedulerFactory.createScheduler('SharedState', 1),
        args.manager_address,
        int(args.manager_port)
        )

    channelFactory = TcpChannelFactory.TcpStringChannelFactory(callbackScheduler)

    # Event diagnostics are enabled only when UFORA_WORKER_DIAGNOSTICS_DIR is set.
    diagnostics_dir = os.getenv("UFORA_WORKER_DIAGNOSTICS_DIR")
    eventHandler = diagnostics_dir and createEventHandler(
        diagnostics_dir,
        callbackSchedulerFactory.createScheduler("ufora-worker-event-handler", 1)
        )

    own_address = args.own_address or get_own_ip()
    print "Listening on", own_address, "ports:", args.base_port, "and", args.base_port + 1

    return CumulusService.CumulusService(
        own_address,
        channelListener,
        channelFactory,
        eventHandler,
        callbackScheduler,
        diagnostics_dir,
        Setup.config(),
        viewFactory=sharedStateViewFactory,
        s3InterfaceFactory=ActualS3Interface.ActualS3InterfaceFactory(),
        objectStore=NullObjectStore.NullObjectStore())
Example #3
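Starts a worker simulation, concatenates a configurable number of taxi-data CSV files from S3, and reports either CSV parse time or download throughput in seconds per GB.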
    def downloadTaxiData(self,
                         filecount,
                         parse=False,
                         workers=1,
                         threadsPerWorker=30,
                         downloaderThreads=8):
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        
        bucketName = self.getTestDataBucket()

        result, simulation = self.computeUsingSeveralWorkers(
            "1+1",
            s3,
            workers,
            memoryLimitMb=45 * 1024 / workers,
            threadCount=threadsPerWorker,
            returnSimulation=True,
            ioTaskThreadOverride=downloaderThreads,
            useInMemoryCache=False,
            objectStore=self.createObjectStore(s3)
            )

        try:
            dsText = (
                """let ds = """ + "+".join([
                    'datasets.s3("%s", "taxi_month_%s.csv")' % (bucketName, ix)
                    for ix in range(1, filecount + 1)
                    ]) + ";"
                )

            text = dsText + "(ds, ds.sum(), size(ds))"

            downloadTimeStart = time.time()
            result = simulation.compute(text, timeout=240)
            self.assertTrue(result.isResult())
            downloadTimeEnd = time.time()
            ds, dsSum, bytecount = result.asResult.result

            if parse:
                parseTimeStart = time.time()
                result = simulation.compute("size(parsing.csv(ds))", timeout=240, ds=ds)
                parseTimeEnd = time.time()

                self.assertTrue(result.isResult())

                PerformanceTestReporter.recordTest(
                    "python.BigBox.LargeS3.ParseTaxidata." + str(filecount),
                    parseTimeEnd - parseTimeStart,
                    None
                    )
            else:
                bytecount = bytecount.pyval
                PerformanceTestReporter.recordTest(
                    "python.BigBox.LargeS3.TaxiSecondsPerGB." + str(filecount),
                    (downloadTimeEnd - downloadTimeStart) / (bytecount / 1024 / 1024.0 / 1024.0),
                    None
                    )
        finally:
            simulation.teardown()
Example #4
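Creates an in-memory simulation executor backed by the real S3 interface, defaulting to 30 threads and 40000 MB of memory per worker.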
    def create_executor(self, **kwds):
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        kwds.setdefault('threadsPerWorker', 30)
        kwds.setdefault('memoryPerWorkerMB', 40000)

        return InMemorySimulationExecutorFactory.create_executor(s3Service=s3, **kwds)
Example #5
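A fuller variant of Example #2 that also prints the RAM cache, thread, and PythonIoTasks configuration before constructing the CumulusService.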
def createService(args):
    callbackSchedulerFactory = CallbackScheduler.createSimpleCallbackSchedulerFactory()
    callbackScheduler = callbackSchedulerFactory.createScheduler('ufora-worker', 1)
    channelListener = MultiChannelListener(callbackScheduler,
                                           [args.base_port, args.base_port + 1])

    sharedStateViewFactory = ViewFactory.ViewFactory.TcpViewFactory(
        callbackSchedulerFactory.createScheduler('SharedState', 1),
        args.manager_address,
        int(args.manager_port)
        )

    channelFactory = TcpChannelFactory.TcpStringChannelFactory(callbackScheduler)

    diagnostics_dir = os.getenv("UFORA_WORKER_DIAGNOSTICS_DIR")
    eventHandler = diagnostics_dir and createEventHandler(
        diagnostics_dir,
        callbackSchedulerFactory.createScheduler("ufora-worker-event-handler", 1)
        )

    own_address = args.own_address or get_own_ip()
    print "Listening on", own_address, "ports:", args.base_port, "and", args.base_port+1

    config = Setup.config()
    print "RAM cache of %d / %d MB and %d threads. Track tcmalloc: %s" % (
        config.cumulusVectorRamCacheMB,
        config.cumulusMaxRamCacheMB,
        config.cumulusServiceThreadCount,
        config.cumulusTrackTcmalloc
        )

    print "Ufora store at %s:%s" % (args.manager_address, args.manager_port)

    s3InterfaceFactory = ActualS3Interface.ActualS3InterfaceFactory()
    print "PythonIoTasks threads: %d. Out of process: %s" % (
        config.externalDatasetLoaderServiceThreads,
        s3InterfaceFactory.isCompatibleWithOutOfProcessDownloadPool
        )

    return CumulusService.CumulusService(
        own_address,
        channelListener,
        channelFactory,
        eventHandler,
        callbackScheduler,
        diagnostics_dir,
        config,
        viewFactory=sharedStateViewFactory,
        s3InterfaceFactory=s3InterfaceFactory,
        objectStore=NullObjectStore.NullObjectStore()
        )
Example #6
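Materializes about 8 GB of vector data against an S3-backed object store with a real disk cache, scans it back with sum(), and records the write and write-and-scan timings.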
    def test_disk_scans(self):
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        objectStore = S3ObjectStore.S3ObjectStore(
            s3, Setup.config().userDataS3Bucket, prefix="test_object_cache/")

        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            memoryLimitMb=1 * 1024,
            threadCount=30,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            objectStore=objectStore,
            useInMemoryCache=False  # use an actual disk cache for this test
            )

        try:
            gigabytes = 8

            t0 = time.time()

            resultVectors = []
            for ix in range(gigabytes):
                result = simulation.compute("Vector.range(125000000 + %s)" %
                                            ix,
                                            timeout=120)
                resultVectors.append(result.asResult.result)

            t1 = time.time()

            intResults = []
            for vec in resultVectors:
                result = simulation.compute("v.sum()", timeout=120, v=vec)
                intResults.append(result.asResult.result.pyval)

            self.assertTrue(len(intResults) == gigabytes)

            PerformanceTestReporter.recordTest("python.BigBox.Disk.Write.10GB",
                                               t1 - t0, None)

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
                time.time() - t0, None)
        finally:
            simulation.teardown()
Example #7
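Builds a paged byte vector of the requested size, writes its bigvec to S3 through an ExternalIoTask, verifies the resulting key size, and deletes the test key afterwards.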
    def writeToS3Test(self,
                      bytecount,
                      pageSizeOverride=1024 * 1024,
                      workers=1,
                      memoryLimitMb=45 * 1024,
                      threadCount=30):
        text = """Vector.range(__bytecount__, {UInt8(_%100)}).paged"""

        s3 = ActualS3Interface.ActualS3InterfaceFactory()

        keyGuid = "bigbox-test-key-" + str(uuid.uuid4())
        
        try:
            setupText = text.replace('__bytecount__', str(bytecount))

            setupResults, simulation = self.computeUsingSeveralWorkers(
                setupText,
                s3,
                workers,
                memoryLimitMb=memoryLimitMb,
                threadCount=threadCount,
                returnSimulation=True,
                ioTaskThreadOverride=8,
                useInMemoryCache=False,
                timeout=30,
                objectStore=self.createObjectStore(s3),
                pageSizeOverride=pageSizeOverride
                )

            result = simulation.executeExternalIoTask(
                CumulusNative.ExternalIoTask.WriteCharBigvecToS3(
                    setupResults.asResult.result.getVectorBigvecGuid(),
                    CumulusNative.S3KeyAndCredentials(
                        self.getTestDataBucket(),
                        keyGuid,
                        "",
                        "",
                        ""
                        )
                    ),
                timeout=60
                )

            self.assertTrue(result.isSuccess(), result)

            self.assertEqual(
                s3().getKeySize(self.getTestDataBucket(), keyGuid),
                bytecount
                )
        finally:
            try:
                s3().deleteKey(self.getTestDataBucket(), keyGuid)
            except Exception:
                logging.warn("Failed to clean up the test key: %s", traceback.format_exc())
Example #8
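The minimal case: a helper that returns a fresh factory instance.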
def getCurrentS3Interface():
    return ActualS3Interface.ActualS3InterfaceFactory()
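
A minimal usage sketch, assuming only what Example #7 shows about the factory API (the factory instance is callable and the returned interface exposes getKeySize and deleteKey); the bucket and key names below are hypothetical placeholders:

# Usage sketch based on Example #7; bucket/key names are placeholders.
s3Factory = getCurrentS3Interface()
s3Interface = s3Factory()  # calling the factory yields an S3 interface

keySize = s3Interface.getKeySize("my-test-bucket", "my-test-key")
s3Interface.deleteKey("my-test-bucket", "my-test-key")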