def run(cloudburst: CloudburstConnection, num_requests: int, num_fns: int,
        data_size: str, do_optimize: bool):
    """Benchmark a linear chain of pass-through map operators.

    Builds a PUSH flow named 'fusion-benchmark' made of ``num_fns`` chained
    map operators, optionally runs it through ``optimize`` with the
    module-level ``optimize_rules``, deploys it, and then issues
    ``num_requests`` sequential requests, timing each one end to end.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: Number of timed requests to issue.
        num_fns: Number of map operators chained one after another.
        data_size: Key into ``DATA_SIZES`` selecting the payload size.
        do_optimize: Whether to call ``optimize`` before deploying.
    """

    def fusion_op(self, row: Row) -> bytes:
        # Each operator simply forwards the 'data' column.
        return row['data']

    payload_size = DATA_SIZES[data_size]
    print(f'Creating flow with {num_fns} operators and {data_size}' +
          f' ({payload_size}) inputs.')

    flow = Flow('fusion-benchmark', FlowType.PUSH, cloudburst)

    # Hang every new operator off the end of the previous one.
    tail = flow
    for _ in range(num_fns):
        tail = tail.map(fusion_op, names=['data'])

    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    # A single-row table of random bytes is reused for every request.
    request = Table([('data', BtsType)])
    request.insert([os.urandom(payload_size)])

    timings = []
    print('Starting benchmark...')
    for req_idx in range(num_requests):
        if req_idx > 0 and req_idx % 100 == 0:
            print(f'On request {req_idx}...')

        began = time.time()
        flow.run(request).get()
        timings.append(time.time() - began)

    print_latency_stats(timings, 'E2E')
def run(cloudburst: CloudburstConnection, num_requests: int, gamma: int,
        num_replicas: int):
    """Benchmark competitive execution of a high-variance stage.

    Builds a three-stage PUSH flow whose middle stage sleeps for a
    gamma-distributed delay, sets ``optimize_rules['compete_replicas']`` to
    ``num_replicas``, optimizes and deploys the flow, and then times
    ``num_requests`` sequential requests.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: Number of timed requests to issue.
        gamma: Key into ``GAMMA_VALS``; the looked-up value is sent along
            with each request as the 'scale' column.
        num_replicas: Value stored under ``compete_replicas`` in the
            module-level ``optimize_rules`` before optimizing.
    """

    def stage1(self, val: int) -> int:
        return val + 1

    def stage2(self, row: Row) -> float:
        import time
        from scipy.stats import gamma

        # The sample is multiplied by 10 and divided by 1000 before being
        # passed to time.sleep(), which interprets its argument as seconds.
        sample = gamma.rvs(3.0, scale=row['scale'])
        delay = sample * 10 / 1000
        time.sleep(delay)
        return delay

    def stage3(self, row: Row) -> float:
        return row['val']

    scale = GAMMA_VALS[gamma]
    print(f'Creating flow with {num_replicas} replicas and' +
          f' gamma={scale}')

    flow = Flow('fusion-benchmark', FlowType.PUSH, cloudburst)
    incremented = flow.map(stage1, col='val')
    delayed = incremented.map(stage2, names=['val'], high_variance=True)
    delayed.map(stage3, names=['val'])

    # This benchmark always optimizes; note that the shared rules dict is
    # updated in place.
    optimize_rules['compete_replicas'] = num_replicas
    flow = optimize(flow, rules=optimize_rules)
    print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    request = Table([('val', IntType), ('scale', FloatType)])
    request.insert([1, scale])

    timings = []
    print('Starting benchmark...')
    for req_idx in range(num_requests):
        if req_idx > 0 and req_idx % 100 == 0:
            print(f'On request {req_idx}...')

        # Pause (outside the timed region) so the queue can drain.
        time.sleep(.300)

        began = time.time()
        flow.run(request).get()
        timings.append(time.time() - began)

    print_latency_stats(timings, 'E2E')
def run(cloudburst: CloudburstConnection, num_requests: int, data_size: str,
        do_optimize: bool):
    """Benchmark a fan-out/fan-in flow used to exercise colocation.

    A first stage produces a random array whose length comes from the
    request; five parallel branches each map that output to a constant and
    are then joined back together. The flow is optionally optimized with
    the module-level ``optimize_rules``, deployed, and timed over
    ``num_requests`` sequential requests.

    Args:
        cloudburst: Connection handed to the ``Flow``.
        num_requests: Number of timed requests to issue.
        data_size: Key into ``DATA_SIZES``; the looked-up value is sent as
            the 'size' column of each request.
        do_optimize: Whether to call ``optimize`` before deploying.
    """

    def stage1(self, row: Row) -> bytes:
        import numpy as np
        return np.random.rand(row['size'])

    def stage2(self, row: Row) -> int:
        return 3

    array_len = DATA_SIZES[data_size]
    print(f'Creating flow with {data_size} ({array_len}) inputs.')

    flow = Flow('colocate-benchmark', FlowType.PUSH, cloudburst)
    source = flow.map(stage1)

    # One branch per output column, created in order; add names here to
    # widen the fan-out.
    branches = []
    for column in ('val1', 'val2', 'val3', 'val4', 'val5'):
        branches.append(source.map(stage2, names=[column]))

    # Fold the branches together left to right.
    joined = branches[0]
    for branch in branches[1:]:
        joined = joined.join(branch)

    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    request = Table([('size', IntType)])
    request.insert([array_len])

    timings = []
    print('Starting benchmark...')
    for req_idx in range(num_requests):
        if req_idx > 0 and req_idx % 100 == 0:
            print(f'On request {req_idx}...')

        began = time.time()
        flow.run(request).get()
        timings.append(time.time() - began)

    print_latency_stats(timings, 'E2E')
def optimize(flow, rules: dict = DEFAULT_RULES):
    """Build an optimized copy of ``flow`` according to ``rules``.

    A fresh ``Flow`` is created with the same name, type, connection and
    source as ``flow``. The operators of ``flow`` are re-added to it (fused
    into multi operators when the ``fusion`` rule is on), and then the
    ``breakpoint``, ``compete`` and ``colocate`` rules are applied in that
    order. When the ``whole`` rule is on, the flow is instead wrapped into
    a single multi operator and returned immediately.

    Args:
        flow: The flow to optimize.
        rules: Mapping of rule name to setting. Keys of ``DEFAULT_RULES``
            that are missing are treated as ``False``. The mapping passed
            in is not modified.

    Returns:
        The newly built ``Flow``.

    Raises:
        FlowError: If both the ``colocate`` and ``breakpoint`` rules are on.
        RuntimeError: If ``compete`` is on and a high-variance operator
            feeds an operator with more than one upstream, or if
            ``colocate`` is on and the first fan-out operator does not
            support broadcast.
    """
    # Work on a private copy so that filling in the missing keys never
    # leaks into the caller's dict (or into DEFAULT_RULES itself).
    rules = {**dict.fromkeys(DEFAULT_RULES, False), **rules}

    if rules['colocate'] and rules['breakpoint']:
        raise FlowError('Cannot enable the colocate and breakpoint rules' +
                        ' together.')

    optimized = Flow(flow.flowname, flow.typ, flow.cloudburst, flow.source)

    if rules['whole']:
        # Clone the flow with every rule disabled, then wrap the clone in a
        # single multi operator.
        cloned = optimize(
            flow, {
                'fusion': False,
                'compete': False,
                'compete_replicas': 1,
                'colocate': False,
                'breakpoint': False,
                'whole': False
            })
        cloned.cloudburst = None  # Remove sockets to serialize and send flow.

        queue = [cloned]
        gpu = False
        batching = []

        while queue:
            op = queue.pop(0)
            op.cb_fn = None
            if type(op) != Flow:
                batching.append(op.batching)
                # Once any operator reports gpu, keep that value.
                if not gpu:
                    gpu = op.gpu

            queue.extend(op.downstreams)

        all_batching = all(batching)
        if all_batching:
            cloned.batching = True

        optimized.multi([cloned], whole=True)
        multi_op = optimized.downstreams[0]
        multi_op.batching = all_batching
        multi_op.gpu = gpu
        if gpu:
            multi_op.fn_name += '-gpu'

        return optimized

    ### OPERATOR FUSION ###
    # Each queue entry pairs an operator of the input flow with the
    # operator of the optimized flow it should be attached to.
    queue = [(ds, optimized) for ds in flow.downstreams]
    join_tracker = {}
    processed = set()

    # NOTE: We clone the whole flow regardless. If fusion is turned on,
    # then we will fuse operators, and otherwise, we simply find chains,
    # throw them away, and add operators to the optimized flow.
    while queue:
        op, upstream = queue.pop(0)

        if op.fn_name in processed:
            continue

        chain = find_chain(op)
        if len(chain) == 0 or not rules['fusion']:
            downstreams = op.downstreams
            processed.add(op.fn_name)

            # Exact type comparisons are kept on purpose; the operator
            # class hierarchy is not visible from here.
            if type(op) == MapOperator:
                marker = upstream.map(op.fn, op.col, op.names,
                                      op.logic.preprocess, op.high_variance,
                                      op.gpu, op.batching, op.multi)
            elif type(op) == FilterOperator:
                marker = upstream.filter(op.fn, op.group, op.logic.preprocess)
            elif type(op) == GroupbyOperator:
                marker = upstream.groupby(op.groupby_key, op.logic.preprocess)
            elif type(op) == CombineOperator:
                marker = upstream.combine()
            elif type(op) == LookupOperator:
                # Merge lookup operators with their successors. The
                # successors' own downstreams are queued here directly, so
                # nothing is left for the generic loop below.
                downstreams = []
                for ds in op.downstreams:
                    if isinstance(ds, MultiOperator):
                        ops = [op] + ds.ops
                    else:
                        ops = [op, ds]

                    marker = upstream.multi(ops)

                    for next_ds in ds.downstreams:
                        queue.append((next_ds, marker))
            elif type(op) == AggOperator:
                marker = upstream.agg(op.aggregate, op.column)
            elif type(op) == MultiOperator:
                # This will only happen in the case where the previous
                # operator was a LookupHelperOperator combined with
                # something else.
                marker = upstream.multi(op.ops)
            elif type(op) == JoinOperator:
                if op.fn_name not in join_tracker:
                    # First visit: remember this side and wait until the
                    # join is reached again from its other upstream.
                    join_tracker[op.fn_name] = upstream
                    downstreams = []
                    processed.discard(op.fn_name)
                else:
                    other = join_tracker[op.fn_name]
                    marker = other.join(upstream, op.on, op.how,
                                        op.logic.preprocess)
        else:
            marker = upstream.multi(chain)
            downstreams = chain[-1].downstreams
            fused = optimized.operators[marker.position]

            for member in chain:
                # Set the multi operator to have various properties.
                if member.high_variance:
                    fused.high_variance = True
                if member.gpu:
                    fused.gpu = True
                    # Hack for autoscaling...
                    fused.fn_name += '-gpu'
                if member.batching:
                    fused.batching = True

            # Batching on the fused operator requires every member to
            # batch; otherwise it is switched back off.
            if fused.batching:
                for old in chain:
                    if not old.batching:
                        print('Cannot create a fused operator with' +
                              ' batching enabled if all operators do' +
                              ' not batch.')
                        fused.batching = False

        for ds in downstreams:
            queue.append((ds, marker))

    ### LOCALITY BREAKPOINTS ###
    if rules['breakpoint']:
        queue = [optimized]
        processed = set()

        while queue:
            op = queue.pop(0)

            if op.fn_name in processed:
                continue

            # We only set breakpoints if we are in a linear chain portion
            # of the flow. This will only be true if there is only one
            # operator in the queue at a time. After pop, the length should
            # be 0 until we add this op's downstreams.
            if len(queue) == 0:
                if isinstance(op, LookupOperator):
                    op.breakpoint = True
                if isinstance(op, MultiOperator):
                    for sub in op.ops:
                        if isinstance(sub, LookupOperator):
                            op.breakpoint = True

            processed.add(op.fn_name)
            queue.extend(op.downstreams)

    ### COMPETITIVE EXECUTION ###
    if rules['compete']:
        new_ops = []
        for operator in optimized.operators.values():
            if operator.high_variance:
                for downstream in operator.downstreams:
                    if len(downstream.upstreams) > 1:
                        raise RuntimeError("Cannot have a competitive" +
                                           " execution map feed into an " +
                                           "operator with multiple upstreams.")
                    downstream.multi_exec = True

                for _ in range(rules['compete_replicas']):
                    # Create a new operator that is an exact replica.
                    # NOTE(review): only Map and Multi operators are
                    # handled; any other high-variance operator type leaves
                    # new_op unset or stale -- confirm that cannot occur.
                    if isinstance(operator, MapOperator):
                        new_op = MapOperator(operator.fn, operator.fntype,
                                             operator.flowname, operator.col,
                                             operator.names,
                                             operator.logic.preprocess,
                                             operator.high_variance,
                                             operator.gpu, operator.batching,
                                             operator.multi, optimized.sink)
                    if isinstance(operator, MultiOperator):
                        new_op = MultiOperator(operator.ops,
                                               operator.flowname,
                                               optimized.sink)

                    # Hook it into the DAG by updating all up/downstreams.
                    new_op.downstreams = list(operator.downstreams)
                    new_op.upstreams = list(operator.upstreams)

                    for op in new_op.downstreams:
                        op.upstreams.append(new_op)
                    for op in new_op.upstreams:
                        op.downstreams.append(new_op)

                    new_ops.append(new_op)

        # Registered after the loop so the dict is not resized while it is
        # being iterated.
        for new_op in new_ops:
            optimized.operators[str(uuid.uuid4())] = new_op

    if rules['colocate']:
        # Walk down the linear prefix of the flow to the first fan-out.
        curr_op = optimized
        while len(curr_op.downstreams) > 0:
            if len(curr_op.downstreams) == 1:
                curr_op = curr_op.downstreams[0]
            else:
                # We only support one colocation for now.
                if not curr_op.supports_broadcast:
                    raise RuntimeError('Unsupported broadcast attempt.')

                optimized.colocates = [op.fn_name
                                       for op in curr_op.downstreams]

                for op in curr_op.downstreams:
                    # NOTE(review): this re-checks curr_op, which was
                    # already checked above; it may have been meant to
                    # check op -- confirm before changing.
                    if not curr_op.supports_broadcast:
                        raise RuntimeError('Unsupported broadcast attempt.')
                    args = list(op.init_args)
                    args[1] = True  # Receive broadcast.
                    op.init_args = tuple(args)

                args = list(curr_op.init_args)
                args[0] = True  # Send broadcast.
                curr_op.init_args = tuple(args)
                break

    return optimized
incept = inceptionv3_model_gpu incept_cons = inceptionv3_init_gpu trans = transform_batch else: resnet = resnet_model resnet_cons = resnet_init incept = inceptionv3_model incept_cons = inceptionv3_init trans = transform with open('imagenet_classes.txt', 'r') as f: classes = [line.strip() for line in f.readlines()] cloudburst.put_object('imagenet-classes', classes) flow = Flow('cascade-flow', FlowType.PUSH, cloudburst) rnet = flow.map(trans, init=transform_init, names=['img'], batching=gpu) \ .map(resnet, init=resnet_cons, names=['img', 'resnet_index', 'resnet_max_prob'], gpu=gpu, batching=gpu) incept = rnet.filter(low_prob) \ .map(incept, init=incept_cons, names=['incept_index', 'incept_max_prob'], gpu=gpu,
indices = np.argsort(all_percentages)[::-1] return classes[indices[0]] import base64 import sys from cloudburst.client.client import CloudburstConnection table = Table([('img', StrType)]) img = base64.b64encode(open('panda.jpg', "rb").read()).decode('ascii') table.insert([img]) cloudburst = CloudburstConnection(sys.argv[1], '3.226.122.35') flow = Flow('ensemble-flow', FlowType.PUSH, cloudburst) img = flow.map(transform, init=transform_init, names=['img']) anet = img.map(alexnet_model, init=alexnet_init, names=['alexnet_index', 'alexnet_perc']) rnet = img.map(resnet_model, init=resnet_init, names=['resnet_index', 'resnet_perc']) anet.join(rnet).map(ensemble_predict, names=['class']) flow.deploy() from cloudburst.server.benchmarks.utils import print_latency_stats import time print('Starting benchmark...') latencies = [] for _ in range(100):
cloudburst.list()

import random
import string

# A random suffix keeps names from colliding with those of earlier runs.
salt = "".join(random.choices(string.ascii_letters, k=6))
square_name = "square-2"+salt

print("Running sanity check")
cloud_sq = cloudburst.register(lambda _, x: x * x, square_name)
print(cloud_sq(2).get())

# Recreate the single-function DAG and invoke it once.
cloudburst.delete_dag("dag")
cloudburst.register_dag("dag", [square_name], [])
print(cloudburst.call_dag("dag", {square_name: [2]}).get())

print("Running example flow")
dataflow = Flow("example-flow"+salt, FlowType.PUSH, cloudburst)
dataflow.map(map_fn, names=["sum"]).filter(filter_fn)

table = Table([("a", IntType), ("b", IntType)])
for _pair in ([1, 2], [1, 3], [1, 4]):
    table.insert(_pair)

dataflow.register()
dataflow.deploy()
print(dataflow)
print("deployed")
print(dataflow.run(table).get())
def run(cloudburst: CloudburstConnection, num_requests: int, batch_size: int,
        gpu: bool):
    """Benchmark a batched ResNet classification flow.

    Stores the ImageNet class names under 'imagenet-classes', deploys a
    one-operator PUSH flow named 'batching-benchmark' (GPU or CPU variant),
    and then, ``num_requests`` times, fires ``batch_size`` requests at once
    and measures how long it takes until all of their results are available
    in the KVS. Prints the throughput and the latency statistics.

    Args:
        cloudburst: Connection used for the flow, for storing the class
            list, and whose ``kvs_client`` is polled for results.
        num_requests: Number of timed rounds.
        batch_size: Number of requests issued per round.
        gpu: Selects the GPU operator (and a 50-request warmup) when true.
    """
    with open('imagenet_classes.txt', 'r') as f:
        classes = [line.strip() for line in f]
    cloudburst.put_object('imagenet-classes', classes)

    def resnet_init_gpu(self, cloudburst):
        import os

        import torch
        import torchvision
        from torchvision import transforms

        tpath = os.path.join(os.getenv('TORCH_HOME'), 'checkpoints')
        self.resnet = torch.load(os.path.join(tpath,
                                              'resnet101.model')).cuda()
        self.resnet.eval()

        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

        self.classes = cloudburst.get('imagenet-classes')

    def resnet_model_gpu(self, table: Table) -> str:
        """ ResNet (resnet101.model) image classification, run on the GPU.

        Returns one class name per input row.
        """
        import torch

        inputs = [self.transforms(row['img']) for row in table.get()]
        inputs = torch.stack(inputs, dim=0).cuda()
        output = self.resnet(inputs)
        _, indices = torch.sort(output, descending=True)
        indices = indices.cpu().detach().numpy()

        # The first index of each sorted row is the top-scoring class.
        return [self.classes[idx_set[0]] for idx_set in indices]

    def resnet_init_cpu(self, cloudburst):
        import os

        import torch
        import torchvision
        from torchvision import transforms

        tpath = os.path.join(os.getenv('TORCH_HOME'), 'checkpoints')
        self.resnet = torch.load(os.path.join(tpath, 'resnet101.model'))
        self.resnet.eval()

        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

        self.classes = cloudburst.get('imagenet-classes')

    def resnet_model_cpu(self, table: Table) -> str:
        """ ResNet (resnet101.model) image classification, run on the CPU.

        Returns one class name per input row.
        """
        import torch

        inputs = [self.transforms(row['img']) for row in table.get()]
        inputs = torch.stack(inputs, dim=0)
        output = self.resnet(inputs)
        _, indices = torch.sort(output, descending=True)
        indices = indices.detach().numpy()

        # The first index of each sorted row is the top-scoring class.
        return [self.classes[idx_set[0]] for idx_set in indices]

    print(f'Creating flow with size {batch_size} batches.')

    flow = Flow('batching-benchmark', FlowType.PUSH, cloudburst)
    if gpu:
        flow.map(resnet_model_gpu, init=resnet_init_gpu, names=['class'],
                 gpu=True, batching=True)
    else:
        flow.map(resnet_model_cpu, init=resnet_init_cpu, names=['class'],
                 batching=True)

    flow.deploy()
    print('Flow successfully deployed!')

    latencies = []
    inp = Table([('img', NumpyType)])
    img = np.array(Image.open('panda.jpg').convert('RGB').resize((224, 224)))
    inp.insert([img])

    kvs = cloudburst.kvs_client

    if gpu:
        print('Starting GPU warmup...')
        for _ in range(50):
            flow.run(inp).get()
        print('Finished warmup...')

    print('Starting benchmark...')
    for i in range(num_requests):
        if i % 100 == 0 and i > 0:
            print(f'On request {i}...')

        futs = [flow.run(inp) for _ in range(batch_size)]
        pending = {fut.obj_id for fut in futs}

        # Break these apart to batch the KVS get requests: poll for all
        # outstanding keys at once until every result has shown up.
        start = time.time()
        while len(pending) > 0:
            response = kvs.get(list(pending))
            for key in response:
                if response[key] is not None:
                    pending.discard(key)
        end = time.time()

        latencies.append(end - start)

    # mean * count is the total time spent waiting across all rounds.
    compute_time = np.mean(latencies) * num_requests
    tput = (batch_size * num_requests) / (compute_time)
    print('THROUGHPUT: %.2f' % (tput))
    print_latency_stats(latencies, 'E2E')
type=str, metavar='O', help='The name of the file with the benchmark IPs', dest='benchmarks', required=True) args = parser.parse_args() benchmark_ips = [] with open(args.benchmarks[0], 'r') as f: benchmark_ips = f.readlines() cloudburst = CloudburstConnection(args.cloudburst[0], args.ip[0]) print('Successfully connected to Cloudburst') flow = Flow('scaling-benchmark', FlowType.PUSH, cloudburst) flow.map(stage1, names=['val']).map(stage2, names=['val']) table = Table([('val', IntType)]) table.insert([1]) num_bench = len(benchmark_ips) num_start = int(start_percent * num_bench) flow.cloudburst = None # Hack to serialize and send flow. queue = [flow] while len(queue) > 0: op = queue.pop(0) op.cb_fn = None
dest='threads', required=True) parser.add_argument('-l', '--local', nargs=1, type=str, metavar='L', help='Whether to run in local mode (required)', dest='local', required=True) args = parser.parse_args() print('Connecting to Cloudburst...') cloudburst = CloudburstConnection(args.cloudburst[0], args.ip[0]) flow = Flow('recsys-flow', FlowType.PUSH, cloudburst) flow.lookup('user', dynamic=True) \ .map(pick_category, names=['user', 'weights', 'category']) \ .lookup('category', dynamic=True) \ .map(get_topk, names=['1', '2', '3', '4', '5']) flow = optimize(flow, rules=optimize_rules) print('Creating data...') # for i in range(NUM_USERS): # if i % 10000 == 0: # print(f'On user {i}...') # user_vector = np.random.randn(512) # cloudburst.put_object(str(i), user_vector)
def run(cloudburst: CloudburstConnection, num_requests: int, data_size: str,
        breakpoint: bool, do_optimize: bool):
    """Benchmark a map -> dynamic lookup -> map flow (locality benchmark).

    Stores ``NUM_DATA_POINTS`` random arrays under the keys 'data-1' ..
    'data-N', builds a flow that derives a key from the request number,
    looks that key up and sums the array, optionally optimizes the flow,
    deploys it, and times ``num_requests`` sequential requests. The raw
    latencies are also serialized to 'data.bts'.

    Args:
        cloudburst: Connection used for the flow and for storing the data.
        num_requests: Number of timed requests to issue.
        data_size: Key into ``DATA_SIZES`` giving the length of each array.
        breakpoint: Stored into ``optimize_rules['breakpoint']``; when true
            a warmup pass over every data point runs before the benchmark.
        do_optimize: Whether to call ``optimize`` before deploying.
    """
    print('Creating data...')
    size = DATA_SIZES[data_size]
    for point in range(1, NUM_DATA_POINTS+1):
        cloudburst.put_object('data-' + str(point), np.random.rand(size))

    def stage1(self, row: Row) -> (int, str):
        idx = int(row['req_num'] / 10) + 1
        key = 'data-%d' % (idx)
        return idx, key

    def stage2(self, row: Row) -> str:
        import numpy as np
        arr = row[row['key']]
        return float(np.sum(arr))

    def make_request(req_num):
        # Every request is a fresh one-row table holding the request number.
        request = Table([('req_num', IntType)])
        request.insert([req_num])
        return request

    print(f'Creating flow with {data_size} ({DATA_SIZES[data_size]}) inputs.')

    flow = Flow('locality-benchmark', FlowType.PUSH, cloudburst)
    keyed = flow.map(stage1, names=['index', 'key'])
    fetched = keyed.lookup('key', dynamic=True)
    fetched.map(stage2, names=['sum'])

    # The shared rules dict is updated even when optimization is skipped.
    optimize_rules['breakpoint'] = breakpoint
    if do_optimize:
        flow = optimize(flow, rules=optimize_rules)
        print('Flow has been optimized...')

    flow.deploy()
    print('Flow successfully deployed!')

    if breakpoint:
        print('Starting warmup...')
        # Request numbers 0, 10, 20, ... map to data-1, data-2, ... in turn.
        for point in range(NUM_DATA_POINTS):
            flow.run(make_request(point * 10)).get()

        print('Pausing to let cache metadata propagate...')
        time.sleep(15)

    timings = []
    print('Starting benchmark...')
    for req_idx in range(num_requests):
        if req_idx > 0 and req_idx % 100 == 0:
            print(f'On request {req_idx}...')

        request = make_request(req_idx)

        began = time.time()
        flow.run(request).get()
        timings.append(time.time() - began)

    with open('data.bts', 'wb') as out:
        from cloudburst.shared.serializer import Serializer
        out.write(Serializer().dump(timings))

    print_latency_stats(timings, 'E2E')
german_init = english_to_german_init_gpu french_init = english_to_french_init_gpu german = english_to_german_gpu french = english_to_french_gpu else: german_init = english_to_german_init french_init = english_to_french_init german = english_to_german french = english_to_french with open('imagenet_classes.txt', 'r') as f: classes = [line.strip() for line in f.readlines()] cloudburst.put_object('imagenet-classes', classes) flow = Flow('nmt-flow', FlowType.PUSH, cloudburst) classified = flow.map(classify_language, init=classify_language_init, names=['language', 'translate'], batching=True) french = classified.filter(filter_french) \ .map(french, init=french_init, names=['french'], gpu=gpu, high_variance=True, batching=gpu) \ .filter(true_filter) german = classified.filter(filter_german) \