def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate one invocation of ``request`` on ``replica``.

    For the duration of the call the replica claims 10% of its node's ``cpu_millis`` capacity and the request is
    registered in the node's ``current_requests``. Both are handed back when the call ends, including when the
    process is interrupted or fails while waiting.

    :param env: the simulation environment
    :param replica: the replica that serves the request
    :param request: the request to execute
    :return: an event generator
    """
    # you would probably either create one simulator per function, or use a generalized simulator, this is just
    # to demonstrate how the simulators are used to encapsulate simulator behavior.
    logger.info('[simtime=%.2f] invoking function %s on node %s', env.now, request, replica.node.name)

    # for full flexibility you decide the resources used
    cpu_millis = replica.node.capacity.cpu_millis * 0.1
    env.resource_state.put_resource(replica, 'cpu', cpu_millis)
    node = replica.node
    node.current_requests.add(request)

    try:
        if replica.function.name == 'python-pi':
            if replica.node.name.startswith('rpi3'):  # those are nodes we created in basic.example_topology()
                duration = 20  # invoking this function takes 20 seconds on a raspberry pi
            else:
                duration = 2  # invoking this function takes 2 seconds on all other nodes in the cluster
        elif replica.function.name == 'resnet50-inference':
            duration = 0.5  # invoking this function takes 500 ms
        else:
            duration = 0

        yield env.timeout(duration)
    finally:
        # also, you have to release them at the end; the finally clause makes sure this happens even if an
        # exception (e.g., an interrupt) is thrown into the generator while it waits
        env.resource_state.remove_resource(replica, 'cpu', cpu_millis)
        node.current_requests.remove(request)
def faas_idler(env: Environment, inactivity_duration=300, reconcile_interval=30):
    """
    Periodically suspend functions of scale-to-zero deployments that have not been invoked for a while.

    Modelled after the OpenFaaS faas-idler:
    https://github.com/openfaas-incubator/faas-idler
    https://github.com/openfaas-incubator/faas-idler/blob/master/main.go
    default values:
    https://github.com/openfaas-incubator/faas-idler/blob/668991c532156275993399ee79a297a4c2d651ec/docker-compose.yml

    :param env: the faas environment
    :param inactivity_duration: idle time (in simulation time units) after which a function is suspended; the
        original faas-idler takes a Golang duration here, i.e. 15m
    :param reconcile_interval: time between two checks; i.e. 1m (default value) in the original faas-idler
    :return: an event generator (never terminates on its own)
    """
    faas: FaasSystem = env.faas

    while True:
        yield env.timeout(reconcile_interval)

        for deployment in faas.get_deployments():
            # only deployments that opted into scale-to-zero are considered
            if not deployment.scale_zero:
                continue

            for function in deployment.function_definitions.values():
                name = function.name

                # nothing to suspend if the function has no running replica
                running = faas.get_replicas(name, FunctionState.RUNNING)
                if len(running) == 0:
                    continue

                idle_time = env.now - env.metrics.last_invocation[name]
                if idle_time < inactivity_duration:
                    continue

                env.process(faas.suspend(name))
                logger.debug('%.2f function %s has been idle for %.2fs', env.now, name, idle_time)
def run(self, env: Environment):
    """
    Deploy the prepared functions, wait for their replicas and execute 10 parallel requests per function.

    :param env: the simulation environment
    :return: an event generator
    """
    # deploy functions
    for deployment in self.prepare_deployments():
        yield from env.faas.deploy(deployment)

    # block until replicas become available (scheduling has finished and replicas have been deployed on the node)
    logger.info('waiting for replica')
    for function_name in ('python-pi', 'resnet50-inference'):
        yield env.process(env.faas.poll_available_replica(function_name))

    # run workload: execute 10 requests per function in parallel
    invocations = []

    logger.info('executing 10 python-pi requests')
    invocations.extend(
        env.process(env.faas.invoke(FunctionRequest('python-pi'))) for _ in range(10)
    )

    logger.info('executing 10 resnet50-inference requests')
    invocations.extend(
        env.process(env.faas.invoke(FunctionRequest('resnet50-inference'))) for _ in range(10)
    )

    # wait for invocation processes to finish
    for invocation in invocations:
        yield invocation
def execute(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate the execution of a request as three consecutive fixed-length phases.

    :param env: the simulation environment
    :param replica: the replica that serves the request (not used here)
    :param request: the request to execute (not used here)
    :return: an event generator
    """
    # for actual network download simulation look at simulate_data_download
    mock_download, training, mock_upload = 1, 5, 1

    for phase_duration in (mock_download, training, mock_upload):
        yield env.timeout(phase_duration)
def solve(self, env: Environment) -> Generator[simpy.events.Event, Any, Any]:
    """
    Run the wrapped solver after each reconcile interval for as long as ``self.running`` is true.

    :param env: the simulation environment
    :return: an event generator
    """
    while self.running:
        pause = env.timeout(self.reconcile_interval)
        yield pause

        yield from self.solver.solve(env)

        # TODO remove when contention is implemented
        self.stop()
def simulate_data_upload(env: Environment, replica: FunctionReplica):
    """
    Simulate the upload of a replica's result data to a storage node.

    Nothing happens unless the replica's pod carries the 'data.skippy.io/sends-to-storage' label. The label value
    is parsed as the data size; the '.../path' label selects the storage node. If the storage node is the
    replica's own node, the transfer is modelled as a plain timeout, otherwise as a network flow whose traffic is
    recorded in the metrics.

    :param env: the simulation environment
    :param replica: the replica that uploads data
    :return: an event generator
    """
    started = env.now
    node = replica.node.ether_node
    labels = replica.pod.spec.labels

    size_label = 'data.skippy.io/sends-to-storage'
    if size_label not in labels:
        return

    # FIXME: storage
    size = parse_size_string(labels[size_label])
    path = labels['data.skippy.io/sends-to-storage/path']

    # the first storage node returned for the path is used as the target
    storage_node_name = env.cluster.get_storage_nodes(path)[0]
    logger.debug('%.2f replica %s uploading data %s to %s', env.now, node, path, storage_node_name)

    if storage_node_name == node.name:
        # FIXME this is essentially a disk read and not a network connection
        local_transfer_time = size / 1.25e+8  # 1.25e+8 = 1 GBit/s
        yield env.timeout(local_transfer_time)
        return

    storage_node = env.cluster.get_node(storage_node_name)
    route = env.topology.route_by_node_name(node.name, storage_node.name)

    yield SafeFlow(env, size, route).start()

    for hop in route.hops:
        env.metrics.log_network(size, 'data_upload', hop)

    elapsed = env.now - started
    env.metrics.log_flow(size, elapsed, route.source, route.destination, 'data_upload')
def teardown(self, env: Environment, replica: FunctionReplica):
    """
    Tear down ``replica`` by removing its base 'cpu' and 'memory' usage from the resource state.

    :param env: the simulation environment
    :param replica: the replica being torn down
    :return: an event generator
    """
    resource_state = env.resource_state

    # basic cpu usage, in %
    resource_state.remove_resource(replica, 'cpu', 0.08)
    # basic memory consumption, in MB
    resource_state.remove_resource(replica, 'memory', 200)

    # the teardown itself takes no simulation time
    done = env.timeout(0)
    yield done
def startup(self, env: Environment, replica: FunctionReplica):
    """
    Simulate the startup of a function replica as a fixed delay of 10 time units.

    :param env: the simulation environment
    :param replica: the replica being started
    :return: an event generator
    """
    function_name = replica.function.name
    logger.info('[simtime=%.2f] starting up function replica for function %s', env.now, function_name)

    # you could create a very fine-grained setup routines here
    docker_startup = env.timeout(10)  # simulate docker startup
    yield docker_startup
def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate one invocation of ``request`` on ``replica``, including queueing, execution and degradation.

    The call waits for a slot in ``self.queue``, samples a function execution time (FET) for the replica's node,
    scales it by the current queue load and finally extends it by the degradation the node estimates for the
    execution interval. Images whose name contains 'preprocessing' or 'training' additionally simulate a data
    download before and a data upload after the execution. The queue slot is always released again, also when
    the invocation fails or the process is interrupted.

    :param env: the simulation environment
    :param replica: the replica that serves the request
    :param request: the request to execute
    :return: an event generator
    :raises ValueError: if no FET could be sampled for the replica's node
    """
    token = self.queue.request()
    t_wait_start = env.now
    yield token  # wait for access
    t_wait_end = env.now
    t_fet_start = env.now

    # from here on we hold a queue slot; the finally clause hands it back on every exit path
    try:
        # because of GIL and Threads, we can easily estimate the additional time caused by concurrent requests
        # to the same Function
        factor = max(1, self.scale(self.queue.count, self.queue.capacity))

        try:
            fet = self.deployment.sample_fet(replica.node.name)
            if fet is None:
                logging.error(
                    f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                )
                raise ValueError(f'{replica.node.name}')
            fet = float(fet) * factor

            image = replica.pod.spec.containers[0].image
            transfers_data = 'preprocessing' in image or 'training' in image

            if transfers_data:
                yield from simulate_data_download(env, replica)

            start = env.now
            call = FunctionCall(request, replica, start)
            replica.node.all_requests.append(call)
            yield env.timeout(fet)

            # add degradation: wait for the extra time the degraded execution takes beyond the plain FET
            end = env.now
            degradation = replica.node.estimate_degradation(start, end)
            delay = max(0, (fet * degradation) - fet)
            yield env.timeout(delay)

            if transfers_data:
                yield from simulate_data_upload(env, replica)

            t_fet_end = env.now
            env.metrics.log_fet(request.name, replica.function.image, replica.node.name, t_fet_start, t_fet_end,
                                t_wait_start, t_wait_end, degradation, id(replica))
            replica.node.set_end(request.request_id, end + delay)
        except KeyError as err:
            # best effort: a failed lookup aborts this invocation without failing the simulation, but leave a
            # trace so that it can be diagnosed
            logging.debug('invocation of %s aborted, key not found: %s', request.name, err)
    finally:
        self.queue.release(token)
def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate one invocation as a fixed delay of 1 time unit and log the node's current parallelism.

    :param env: the simulation environment
    :param replica: the replica that serves the request
    :param request: the request to execute
    :return: an event generator
    """
    # Sketch of a more realistic model:
    # 1) get parameters of base distribution (ideal case)
    # 2) check the utilization of the node the replica is running on
    # 3) transform distribution parameters with degradation function depending on utilization
    # 4) sample from that distribution
    node = replica.node
    in_parallel = len(node.current_requests)

    logger.info('invoking %s on %s (%d in parallel)', request.name, node.name, in_parallel)

    yield env.timeout(1)
def __init__(self, topology: Topology, benchmark: Benchmark, env: Environment = None, timeout=None, name=None):
    """
    Store the configuration of a simulation run.

    :param topology: the topology to simulate
    :param benchmark: the benchmark to run
    :param env: the simulation environment; a new ``Environment`` is created if none (or a falsy value) is given
    :param timeout: stored as ``self.timeout``
    :param name: stored as ``self.name``
    """
    if env:
        self.env = env
    else:
        self.env = Environment()

    self.topology, self.benchmark = topology, benchmark
    self.timeout, self.name = timeout, name
def run(self, env: Environment):
    """
    Deploy all deployments, wait for their replicas and start one request trigger per deployment.

    Deployments without an entry in ``self.arrival_profiles`` are skipped with a warning. If ``self.duration`` is
    not set, each trigger is limited to 1000 requests; otherwise the trigger processes are handed to
    ``self.wait`` (presumably to stop them after the duration -- confirm against ``wait``).

    :param env: the simulation environment
    :return: an event generator
    """
    for deployment in self.deployments:
        yield from env.faas.deploy(deployment)

    for deployment in self.deployments:
        yield env.process(env.faas.poll_available_replica(deployment.name))

    ps = []
    logging.info('executing requests')
    for deployment in self.deployments:
        # only the profile lookup is guarded, so that a KeyError raised elsewhere is not misreported as a
        # missing arrival profile
        try:
            ia_generator = self.arrival_profiles[deployment.name]
        except KeyError:
            logging.warning('no arrival profile for deployment %s', deployment.name)
            continue

        if self.duration is None:
            trigger = function_trigger(env, deployment, ia_generator, max_requests=1000)
        else:
            trigger = function_trigger(env, deployment, ia_generator)
        ps.append(env.process(trigger))

    if self.duration is not None:
        env.process(self.wait(env, ps))

    # wait for the trigger processes one by one; a plain loop is used instead of `yield from ps` because
    # `yield from` would forward a non-None value sent into this generator to the list iterator, which has no
    # send() method
    for p in ps:
        yield p
def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate one invocation as a delay that depends on the function and on the node it runs on.

    :param env: the simulation environment
    :param replica: the replica that serves the request
    :param request: the request to execute
    :return: an event generator
    """
    # you would probably either create one simulator per function, or use a generalized simulator, this is just
    # to demonstrate how the simulators are used to encapsulate simulator behavior.
    logger.info('[simtime=%.2f] invoking function %s on node %s', env.now, request, replica.node.name)

    function_name = replica.function.name

    if function_name == 'python-pi':
        # 'rpi3' nodes are the ones we created in basic.example_topology()
        on_raspberry_pi = replica.node.name.startswith('rpi3')
        # invoking this function takes 20 seconds on a raspberry pi and 2 seconds on all other nodes in the
        # cluster
        duration = 20 if on_raspberry_pi else 2
    elif function_name == 'resnet50-inference':
        duration = 0.5  # invoking this function takes 500 ms
    else:
        duration = 0

    yield env.timeout(duration)
def pull(env: Environment, image_str: str, node: Node):
    """
    Simulate a docker pull command of the given image on the given node.

    The image is looked up in the container registry for the node's architecture. If the node state already
    lists the image, or the image has no positive size, nothing is transferred. Otherwise the image is recorded
    on the node and its size is sent as a network flow from the registry to the node.

    :param env: the simulation environment
    :param image_str: the name of the image (<repository[:tag]>)
    :param node: the node on which to run the pull command
    :return: a simpy process (a generator)
    :raises ValueError: if the registry has no matching image for the node's architecture
    """
    started = env.now

    # TODO: there's a lot of potential to improve fidelity here: consider image layers, simulate extraction time,
    #  etc. e.g., docker pull on a 13MB container takes about 5 seconds. the simulated time at 120 MBit/sec would
    #  be <1s

    # find the image in the registry with the node's architecture
    candidates = env.container_registry.find(image_str, arch=node.arch)
    if not candidates:
        raise ValueError('image not in registry: %s arch=%s' % (image_str, node.arch))
    image = candidates[0]

    node_state = env.get_node_state(node.name)
    if node_state:
        known_images = node_state.docker_images
        if image in known_images:
            return
        # the image is registered before the transfer below has finished
        known_images.add(image)

    size = image.size
    if size <= 0:
        return

    # FIXME: crude simulation of layer sharing (90% across images is shared)
    # num_images = len(env.cluster.images_on_nodes[node.name]) - 1
    # if num_images > 0:
    #     size = size * 0.1

    route = env.topology.route(DockerRegistry, node)
    yield SafeFlow(env, size, route).start()

    # for hop in route.hops:
    #     env.metrics.log_network(size, 'docker_pull', hop)
    duration = env.now - started
    env.metrics.log_flow(size, duration, route.source, route.destination, 'docker_pull')
def run(self, env: Environment):
    """
    Deploy the prepared functions, wait for a 'python-pi' replica and trigger 100 requests for the first deployment.

    :param env: the simulation environment
    :return: an event generator
    """
    # deploy functions
    deployments = self.prepare_deployments()
    for deployment in deployments:
        yield from env.faas.deploy(deployment)

    # block until replicas become available (scheduling has finished and replicas have been deployed on the node)
    logger.info('waiting for replica')
    replica_available = env.process(env.faas.poll_available_replica('python-pi'))
    yield replica_available

    # generate profile
    rps_profile = constant_rps_profile(rps=20)
    ia_generator = expovariate_arrival_profile(rps_profile)

    # run profile
    target = deployments[0]
    yield from function_trigger(env, target, ia_generator, max_requests=100)
def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate one invocation of ``request`` on ``replica`` with queueing.

    The call waits for a slot in ``self.queue``, samples a function execution time (FET) for the replica's node,
    scales it by the current queue load and waits for that long. The queue slot is always released again, also
    when the invocation fails or the process is interrupted.

    :param env: the simulation environment
    :param replica: the replica that serves the request
    :param request: the request to execute
    :return: an event generator
    :raises ValueError: if no FET could be sampled for the replica's node
    """
    token = self.queue.request()
    yield token  # wait for access

    # from here on we hold a queue slot; the finally clause hands it back on every exit path
    try:
        # because of GIL and Threads, we can easily estimate the additional time caused by concurrent requests
        # to the same Function
        factor = max(1, self.scale(self.queue.count, self.queue.capacity))

        try:
            fet = self.deployment.sample_fet(replica.node.name)
            if fet is None:
                logging.error(
                    f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                )
                raise ValueError(f'{replica.node.name}')

            fet = float(fet) * factor
            yield env.timeout(fet)
        except KeyError:
            # best effort: a failed lookup skips the execution without failing the simulation
            pass
    finally:
        self.queue.release(token)
def solve(self, env: Environment) -> Generator[simpy.events.Event, Any, Any]:
    """
    Run the genetic algorithm, apply its results and advance the simulation by the time the computation took.

    Clusters, devices and the derived state are created on first use (while ``self.clusters`` is empty). The
    requirements of every result are then applied via ``set_reqs_for_cluster``.

    :param env: the simulation environment
    :return: an event generator
    """
    logging.info('Calculating Pod Labels')
    # a monotonic clock is used because the measured duration is passed to env.timeout, which must never receive
    # a negative delay; time.time() can jump backwards when the system clock is adjusted
    start = time.perf_counter()

    if not self.clusters:
        # TODO caching because this may bottleneck - needs to figure out if clusters/devices have changed
        self.clusters: Dict[str, Cluster] = create_clusters(env)
        self.devices = get_devices(env)
        self.state = State(self.devices, self.clusters)

    # the execute_* methods are handed this list to fill
    results = []
    if self.settings.parallel:
        self.execute_ga_parallel(results)
    else:
        self.execute_ga_single_threaded(results)

    for result in results:
        set_reqs_for_cluster(result.instance.cluster, result.requirements, env)

    elapsed = time.perf_counter() - start
    logging.info("Done calculating pods")

    # the real computation time is charged as simulation time
    yield env.timeout(elapsed)
def function_trigger(env: Environment, deployment: FunctionDeployment, ia_generator, max_requests=None):
    """
    Repeatedly invoke ``deployment``, waiting one inter-arrival time before each request.

    The trigger stops silently when it is interrupted, and logs an error when ``ia_generator`` is exhausted.

    :param env: the simulation environment
    :param deployment: the deployment whose name is used for the generated requests
    :param ia_generator: an iterator that yields the inter-arrival time before each request
    :param max_requests: the number of requests to generate; unlimited if None
    :return: an event generator
    """
    try:
        if max_requests is None:
            # iter(int, 1) keeps calling int() (which is always 0) until it returns 1, i.e., it never ends
            rounds = iter(int, 1)
        else:
            rounds = range(max_requests)

        for _ in rounds:
            interarrival = next(ia_generator)
            yield env.timeout(interarrival)
            # fire and forget: the trigger does not wait for the invocation to finish
            env.process(env.faas.invoke(FunctionRequest(deployment.name)))
    except simpy.Interrupt:
        pass
    except StopIteration:
        logging.error(f'{deployment.name} gen has finished')
def deploy(self, env: Environment, replica: FunctionReplica):
    """
    Deploy ``replica``; this implementation does no work and takes no simulation time.

    :param env: the simulation environment
    :param replica: the replica being deployed (not used here)
    :return: an event generator
    """
    immediately = env.timeout(0)
    yield immediately
sched_params = { 'percentage_of_nodes_to_score': 100, 'priorities': priorities, 'predicates': predicates } # Set arrival profiles/workload pattern benchmark = ConstantBenchmark('mixed', duration=200, rps=50) # Initialize topology storage_index = StorageIndex() topology = urban_sensing_topology(ether_nodes, storage_index) # Initialize environment env = Environment() env.simulator_factory = AIPythonHTTPSimulatorFactory( get_raith21_function_characterizations(resource_oracle, fet_oracle)) env.metrics = Metrics(env, log=RuntimeLogger(SimulatedClock(env))) env.topology = topology env.faas = DefaultFaasSystem(env, scale_by_requests=True) env.container_registry = ContainerRegistry() env.storage_index = storage_index env.cluster = SimulationClusterContext(env) env.scheduler = Scheduler(env.cluster, **sched_params) sim = Simulation(env.topology, benchmark, env=env) result = sim.run() dfs = {
def setup(self, env: Environment, replica: FunctionReplica):
    """
    Set up ``replica``; there is no setup routine, so this takes no simulation time.

    :param env: the simulation environment
    :param replica: the replica being set up (not used here)
    :return: an event generator
    """
    # no setup routine
    nothing_to_do = env.timeout(0)
    yield nothing_to_do
def release_resources(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Remove 0.2 of the 'cpu' resource recorded for ``replica`` from the resource state.

    :param env: the simulation environment
    :param replica: the replica whose resource usage is reduced
    :param request: the request that was served (not used here)
    :return: an event generator
    """
    resource_state = env.resource_state
    resource_state.remove_resource(replica, 'cpu', 0.2)

    # releasing takes no simulation time
    released = env.timeout(0)
    yield released
def claim_resources(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Record 0.2 of the 'cpu' resource for ``replica`` in the resource state.

    :param env: the simulation environment
    :param replica: the replica whose resource usage is increased
    :param request: the request about to be served (not used here)
    :return: an event generator
    """
    # no setup time, no memory because everything is cached - only cpu usage
    resource_state = env.resource_state
    resource_state.put_resource(replica, 'cpu', 0.2)

    claimed = env.timeout(0)
    yield claimed
def execute(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Simulate the execution of a request as a fixed delay of 0.2 time units.

    :param env: the simulation environment
    :param replica: the replica that serves the request (not used here)
    :param request: the request to execute (not used here)
    :return: an event generator
    """
    execution = env.timeout(0.2)
    yield execution
def solve(self, env: Environment) -> Generator[simpy.events.Event, Any, Any]:
    """
    No-op solver: does no work and takes no simulation time.

    :param env: the simulation environment
    :return: an event generator
    """
    no_delay = env.timeout(0)
    yield no_delay
def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    No-op invocation: does no work and takes no simulation time.

    :param env: the simulation environment
    :param replica: the replica that serves the request (not used here)
    :param request: the request to execute (not used here)
    :return: an event generator
    """
    no_delay = env.timeout(0)
    yield no_delay
def startup(self, env: Environment, replica: FunctionReplica):
    """
    No-op startup: does no work and takes no simulation time.

    :param env: the simulation environment
    :param replica: the replica being started (not used here)
    :return: an event generator
    """
    no_delay = env.timeout(0)
    yield no_delay
def teardown(self, env: Environment, replica: FunctionReplica):
    """
    No-op teardown: does no work and takes no simulation time.

    :param env: the simulation environment
    :param replica: the replica being torn down (not used here)
    :return: an event generator
    """
    no_delay = env.timeout(0)
    yield no_delay
def run(self, env: Environment):
    """
    No-op run: does no work and takes no simulation time.

    :param env: the simulation environment
    :return: an event generator
    """
    no_delay = env.timeout(0)
    yield no_delay
def claim_resources(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
    """
    Record 0.7 of the 'cpu' and 0.3 of the 'memory' resource for ``replica`` in the resource state.

    :param env: the simulation environment
    :param replica: the replica whose resource usage is increased
    :param request: the request about to be served (not used here)
    :return: an event generator
    """
    resource_state = env.resource_state
    resource_state.put_resource(replica, 'cpu', 0.7)
    resource_state.put_resource(replica, 'memory', 0.3)

    # claiming takes no simulation time
    claimed = env.timeout(0)
    yield claimed