def test_byteToStringAndBackInDifferentPatterns(self):
    """Time `dataAsString` over byte slices of a large vector.

    Builds a ~3GB UInt8 vector plus a list of (start, stop) index pairs,
    then records how long it takes to materialize the slices as strings,
    first over a flat vector of pairs and then over a nested vector.
    """
    s3Factory = ActualS3Interface.ActualS3InterfaceFactory()

    setupText = """
        let ds = Vector.range(3000000000, {UInt8(_%100)});

        let dat = Vector.range(100, fun(block) {
            Vector.range(1000000, fun(o) {
                let base = block * 10000000 + o * 10;
                (base, base + 10)
                })
            });

        (ds, dat, dat.sum())
        """

    setupResults, simulation = self.computeUsingSeveralWorkers(
        setupText,
        s3Factory,
        1,
        memoryLimitMb=45 * 1024,
        threadCount=30,
        returnSimulation=True,
        ioTaskThreadOverride=8,
        useInMemoryCache=False,
        timeout=30,
        objectStore=self.createObjectStore(s3Factory)
        )

    try:
        ds, dat, datSum = setupResults.asResult.result

        # Flat pattern: one pass over the (start, stop) pairs.
        startTime = time.time()
        simulation.compute(
            "size(datSum ~~ { ds[_[0],_[1]].dataAsString }) == size(datSum)",
            timeout=120,
            ds=ds,
            dat=dat,
            datSum=datSum
            )
        PerformanceTestReporter.recordTest(
            "python.BigBox.DataAsString.FlatVector",
            time.time() - startTime,
            None
            )

        # Nested pattern: same slices, but grouped into per-block vectors.
        startTime = time.time()
        simulation.compute(
            "size(dat ~~ {_ ~~ { ds[_[0],_[1]].dataAsString } }) == size(dat)",
            timeout=120,
            ds=ds,
            dat=dat,
            datSum=datSum
            )
        PerformanceTestReporter.recordTest(
            "python.BigBox.DataAsString.NestedVector",
            time.time() - startTime,
            None
            )
    finally:
        simulation.teardown()
def createService(args): callbackSchedulerFactory = CallbackScheduler.createSimpleCallbackSchedulerFactory( ) callbackScheduler = callbackSchedulerFactory.createScheduler( 'ufora-worker', 1) channelListener = MultiChannelListener( callbackScheduler, [args.base_port, args.base_port + 1]) sharedStateViewFactory = ViewFactory.ViewFactory.TcpViewFactory( callbackSchedulerFactory.createScheduler('SharedState', 1), args.manager_address, int(args.manager_port)) channelFactory = TcpChannelFactory.TcpStringChannelFactory( callbackScheduler) diagnostics_dir = os.getenv("UFORA_WORKER_DIAGNOSTICS_DIR") eventHandler = diagnostics_dir and createEventHandler( diagnostics_dir, callbackSchedulerFactory.createScheduler("ufora-worker-event-handler", 1)) own_address = args.own_address or get_own_ip() print "Listening on", own_address, "ports:", args.base_port, "and", args.base_port + 1 return CumulusService.CumulusService( own_address, channelListener, channelFactory, eventHandler, callbackScheduler, diagnostics_dir, Setup.config(), viewFactory=sharedStateViewFactory, s3InterfaceFactory=ActualS3Interface.ActualS3InterfaceFactory(), objectStore=NullObjectStore.NullObjectStore())
def downloadTaxiData(self, filecount, parse=False, workers=1, threadsPerWorker=30, downloaderThreads=8):
    """Download `filecount` taxi CSV files from S3 and record timing.

    When `parse` is True, records the CSV parse time; otherwise records
    download seconds per gigabyte. Memory is split evenly across workers.
    """
    s3Factory = ActualS3Interface.ActualS3InterfaceFactory()
    bucketName = self.getTestDataBucket()

    result, simulation = self.computeUsingSeveralWorkers(
        "1+1",
        s3Factory,
        workers,
        memoryLimitMb=45 * 1024 / workers,
        threadCount=threadsPerWorker,
        returnSimulation=True,
        ioTaskThreadOverride=downloaderThreads,
        useInMemoryCache=False,
        objectStore=self.createObjectStore(s3Factory)
        )

    try:
        # Concatenate one datasets.s3(...) term per monthly file.
        datasetTerms = [
            'datasets.s3("%s", "taxi_month_%s.csv")' % (bucketName, ix)
            for ix in range(1, filecount + 1)
            ]
        dsText = """let ds = """ + "+".join(datasetTerms) + ";"

        downloadTimeStart = time.time()
        result = simulation.compute(dsText + "(ds, ds.sum(), size(ds))", timeout=240)
        self.assertTrue(result.isResult())
        downloadTimeEnd = time.time()

        ds, dsSum, bytecount = result.asResult.result

        if parse:
            parseTimeStart = time.time()
            result = simulation.compute("size(parsing.csv(ds))", timeout=240, ds=ds)
            parseTimeEnd = time.time()
            self.assertTrue(result.isResult())

            PerformanceTestReporter.recordTest(
                "python.BigBox.LargeS3.ParseTaxidata." + str(filecount),
                parseTimeEnd - parseTimeStart,
                None
                )
        else:
            bytecount = bytecount.pyval
            gigabytes = bytecount / 1024 / 1024.0 / 1024.0

            PerformanceTestReporter.recordTest(
                "python.BigBox.LargeS3.TaxiSecondsPerGB." + str(filecount),
                (downloadTimeEnd - downloadTimeStart) / gigabytes,
                None
                )
    finally:
        simulation.teardown()
def create_executor(self, **kwds):
    """Create an in-memory simulation executor backed by the real S3 interface.

    Supplies default thread and memory settings unless the caller
    overrides them via keyword arguments.
    """
    kwds.setdefault('threadsPerWorker', 30)
    kwds.setdefault('memoryPerWorkerMB', 40000)

    return InMemorySimulationExecutorFactory.create_executor(
        s3Service=ActualS3Interface.ActualS3InterfaceFactory(),
        **kwds
        )
def createService(args): callbackSchedulerFactory = CallbackScheduler.createSimpleCallbackSchedulerFactory() callbackScheduler = callbackSchedulerFactory.createScheduler('ufora-worker', 1) channelListener = MultiChannelListener(callbackScheduler, [args.base_port, args.base_port + 1]) sharedStateViewFactory = ViewFactory.ViewFactory.TcpViewFactory( callbackSchedulerFactory.createScheduler('SharedState', 1), args.manager_address, int(args.manager_port) ) channelFactory = TcpChannelFactory.TcpStringChannelFactory(callbackScheduler) diagnostics_dir = os.getenv("UFORA_WORKER_DIAGNOSTICS_DIR") eventHandler = diagnostics_dir and createEventHandler( diagnostics_dir, callbackSchedulerFactory.createScheduler("ufora-worker-event-handler", 1) ) own_address = args.own_address or get_own_ip() print "Listening on", own_address, "ports:", args.base_port, "and", args.base_port+1 config = Setup.config() print "RAM cache of %d / %d MB and %d threads. Track tcmalloc: %s" % ( config.cumulusVectorRamCacheMB, config.cumulusMaxRamCacheMB, config.cumulusServiceThreadCount, config.cumulusTrackTcmalloc ) print "Ufora store at %s:%s" % (args.manager_address, args.manager_port) s3InterfaceFactory = ActualS3Interface.ActualS3InterfaceFactory() print "PythonIoTasks threads: %d. Out of process: %s" % ( config.externalDatasetLoaderServiceThreads, s3InterfaceFactory.isCompatibleWithOutOfProcessDownloadPool ) return CumulusService.CumulusService( own_address, channelListener, channelFactory, eventHandler, callbackScheduler, diagnostics_dir, Setup.config(), viewFactory=sharedStateViewFactory, s3InterfaceFactory=s3InterfaceFactory, objectStore=NullObjectStore.NullObjectStore() )
def test_disk_scans(self):
    """Write several 1GB-ish vectors through the disk cache, then scan them.

    Records the elapsed time for the write phase and for write-plus-scan.
    """
    s3Factory = ActualS3Interface.ActualS3InterfaceFactory()

    objectStore = S3ObjectStore.S3ObjectStore(
        s3Factory,
        Setup.config().userDataS3Bucket,
        prefix="test_object_cache/"
        )

    _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
        "1+1",
        s3Factory,
        1,
        memoryLimitMb=1 * 1024,
        threadCount=30,
        returnSimulation=True,
        ioTaskThreadOverride=8,
        objectStore=objectStore,
        useInMemoryCache=False #use an actual disk cache for this
        )

    try:
        gigabytes = 8
        startTime = time.time()

        # Write phase: materialize one large vector per gigabyte. The +ix
        # makes each vector distinct so nothing is deduplicated.
        resultVectors = []
        for ix in range(gigabytes):
            computed = simulation.compute(
                "Vector.range(125000000 + %s)" % ix,
                timeout=120
                )
            resultVectors.append(computed.asResult.result)

        writeDoneTime = time.time()

        # Scan phase: sum each vector, forcing it back off disk.
        intResults = []
        for vec in resultVectors:
            computed = simulation.compute("v.sum()", timeout=120, v=vec)
            intResults.append(computed.asResult.result.pyval)

        self.assertTrue(len(intResults) == gigabytes)

        # NOTE(review): metric name says 10GB but `gigabytes` is 8 — kept
        # as-is so the historical perf series is not broken; confirm intent.
        PerformanceTestReporter.recordTest(
            "python.BigBox.Disk.Write.10GB",
            writeDoneTime - startTime,
            None
            )
        PerformanceTestReporter.recordTest(
            "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
            time.time() - startTime,
            None
            )
    finally:
        simulation.teardown()
def writeToS3Test(self, bytecount, pageSizeOverride=1024*1024, workers=1, memoryLimitMb=45 * 1024, threadCount=30):
    """Create a paged `bytecount`-byte vector and write it to S3 as one key.

    Builds the vector in the simulation, issues a WriteCharBigvecToS3
    external IO task against the test-data bucket under a fresh UUID key,
    and verifies the uploaded key has exactly `bytecount` bytes. The key
    is deleted on the way out, best-effort.
    """
    text = """Vector.range(__bytecount__, {UInt8(_%100)}).paged"""

    s3 = ActualS3Interface.ActualS3InterfaceFactory()

    # Unique key per run so concurrent/repeated tests don't collide.
    keyGuid = "bigbox-test-key-" + str(uuid.uuid4())

    try:
        setupText = text.replace('__bytecount__', str(bytecount))

        setupResults, simulation = self.computeUsingSeveralWorkers(
            setupText,
            s3,
            workers,
            memoryLimitMb=memoryLimitMb,
            threadCount=threadCount,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            useInMemoryCache=False,
            timeout=30,
            objectStore=self.createObjectStore(s3),
            pageSizeOverride=pageSizeOverride
            )

        result = simulation.executeExternalIoTask(
            CumulusNative.ExternalIoTask.WriteCharBigvecToS3(
                setupResults.asResult.result.getVectorBigvecGuid(),
                CumulusNative.S3KeyAndCredentials(
                    self.getTestDataBucket(),
                    keyGuid,
                    "",
                    "",
                    ""
                    )
                ),
            timeout=60
            )

        self.assertTrue(result.isSuccess(), result)

        # Use a unittest assertion rather than a bare `assert`, which is
        # stripped when Python runs with -O.
        self.assertEqual(
            s3().getKeySize(self.getTestDataBucket(), keyGuid),
            bytecount
            )
    finally:
        try:
            s3().deleteKey(self.getTestDataBucket(), keyGuid)
        except Exception:
            # Best-effort cleanup: log and continue. Was a bare `except:`,
            # which would also swallow KeyboardInterrupt/SystemExit.
            logging.warn("Failed to cleanup the test key: %s", traceback.format_exc())
def getCurrentS3Interface():
    """Return a factory for the real (non-mock) S3 interface."""
    return ActualS3Interface.ActualS3InterfaceFactory()