def __update_workload(self, func, arg, workload_id):
    try:
        with self.__lock:
            log.debug("Acquired lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            start_time = time.time()
            func(arg)
            stop_time = time.time()

            if self.__reg is not None:
                self.__reg.distribution_summary(
                    self.__get_workload_processing_metric_name(func.__name__),
                    self.__tags).record(stop_time - start_time)
                self.__reg.distribution_summary(
                    WORKLOAD_PROCESSING_DURATION,
                    self.__tags).record(stop_time - start_time)

        log.debug("Released lock for func: {} on workload: {}".format(
            func.__name__, workload_id))
        return True
    except:
        self.__error_count += 1
        log.exception("Failed to execute func: {} on workload: {}".format(
            func.__name__, workload_id))
        return False
def __process_events(self):
    while not self.__stopped:
        try:
            event = self.__q.get(timeout=self.__event_timeout)
            dequeue_time = time.time()
            log.info("Dequeued event: {}, queue depth: {}".format(
                event[ACTION], self.get_queue_depth()))
            if self.__reg is not None:
                self.__reg.counter(DEQUEUED_COUNT_KEY, self.__tags).increment()
                self.__reg.counter(
                    self.__get_dequeued_metric_name(event), self.__tags).increment()
                self.__reg.distribution_summary(QUEUE_LATENCY_KEY, self.__tags).record(
                    dequeue_time - event[ENQUEUE_TIME_KEY])
        except Empty:
            log.debug("Timed out waiting for event on queue.")
            continue

        for event_handler in self.__event_handlers:
            try:
                log.info("{} handling event: {}".format(
                    type(event_handler).__name__, event[ACTION]))
                event_handler.handle(event)
                self.__report_succeeded_event(event_handler)
            except:
                log.exception("Event handler: '{}' failed to handle event: '{}'".format(
                    type(event_handler).__name__, event))
                self.__report_failed_event(event_handler)

        self.__q.task_done()
        # Guard the registry here as well; it may be None, as the dequeue path above checks.
        if self.__reg is not None:
            self.__reg.counter(EVENT_PROCESSED_KEY, self.__tags).increment()
            self.__reg.gauge(QUEUE_DEPTH_KEY, self.__tags).set(self.get_queue_depth())
        self.__processed_count += 1
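# A minimal sketch (not from the source) of the producer side that __process_events above
# assumes: events must be stamped with ENQUEUE_TIME_KEY at enqueue time so the
# QUEUE_LATENCY_KEY distribution summary can be recorded on dequeue. The enqueue_event
# name and the ENQUEUED_COUNT_KEY counter are hypothetical; the module context (time,
# the metric registry, the key constants) is assumed from the surrounding code.
def enqueue_event(q, event, reg, tags):
    event[ENQUEUE_TIME_KEY] = time.time()  # read back on dequeue to compute queue latency
    q.put_nowait(event)
    if reg is not None:
        reg.counter(ENQUEUED_COUNT_KEY, tags).increment()  # hypothetical counterpart to DEQUEUED_COUNT_KEY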
def get_cell_name():
    config_manager = get_config_manager()
    if config_manager is None:
        log.warning("Config manager is not yet set.")
        return UNKNOWN_CELL

    url = config_manager.get_str(REMOTE_ALLOCATOR_URL)
    if url is None:
        log.warning("No remote solver URL specified.")
        return UNKNOWN_CELL

    timeout = config_manager.get_int(MAX_SOLVER_RUNTIME, DEFAULT_MAX_SOLVER_RUNTIME)
    try:
        response = requests.get(url, timeout=timeout)
        cell_name = response.headers.get(TITUS_ISOLATE_CELL_HEADER, None)
        if cell_name is None:
            log.warning("Titus isolation cell header is not set.")
            return UNKNOWN_CELL
        else:
            return cell_name
    except:
        log.exception("Failed to determine isolation cell.")
        return UNKNOWN_CELL
def __sample_monitors(self):
    with self.__lock:
        for workload_id, monitor in self.__monitors.items():
            try:
                Thread(target=monitor.sample).start()
            except:
                log.exception(
                    "Failed to sample performance of workload: '{}'".format(workload_id))
def get_name(self) -> str:
    url = "{}/cpu_allocator".format(self.__url)
    try:
        response = requests.get(url, timeout=self.__timeout)
        return "Remote({})".format(response.text)
    except:
        log.exception("Failed to GET cpu allocator name.")
        return UNKNOWN_CPU_ALLOCATOR
def __report_metrics(self):
    try:
        tags = self.get_tags()
        for reporter in self.__reporters:
            reporter.report_metrics(tags)
    except Exception:
        log.exception("Failed to report metrics.")
def report_event(self, payload: dict):
    try:
        payload['ts'] = str(datetime.datetime.utcnow())
        event = {"uuid": str(uuid.uuid4()), "payload": payload}
        msg = get_event_msg(event)
        self.__q.put_nowait(msg)
    except:
        self.__failed_msg_count += 1
        log.exception("Failed to report event for payload: {}".format(payload))
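# A hedged usage sketch for report_event: any JSON-serializable dict can serve as the
# payload; the method stamps a 'ts' field and wraps it in a {"uuid": ..., "payload": ...}
# envelope before enqueuing it for the event log sender. The reporter variable and the
# payload fields shown are illustrative, not from the source.
reporter.report_event({
    "event": "workload_rebalanced",  # hypothetical field
    "workload_id": "abc-123",        # hypothetical field
})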
def __get_cpuset(self, container_name: str) -> Optional[str]:
    # Returns None when the cpuset cannot be read, hence Optional[str] rather than str.
    try:
        self.__wait_for_files(container_name)
        log.info("getting cpuset.cpus for workload: '{}'".format(container_name))
        return get_cpuset(container_name)
    except:
        log.exception("Failed to read cpuset for workload: '{}'".format(container_name))
        return None
def rebalance(self, request: AllocateRequest) -> AllocateResponse:
    try:
        self.__primary_rebalance_call_count += 1
        return self.__primary_allocator.rebalance(request)
    except:
        log.exception(
            "Failed to rebalance workloads: '{}' with primary allocator: '{}', "
            "falling back to: '{}'".format(
                [w.get_id() for w in request.get_workloads().values()],
                self.__primary_allocator.__class__.__name__,
                self.__secondary_allocator.__class__.__name__))
        self.__secondary_rebalance_call_count += 1
        return self.__secondary_allocator.rebalance(request)
def free_threads(self, request: AllocateThreadsRequest) -> AllocateResponse:
    try:
        self.__primary_free_threads_call_count += 1
        return self.__primary_allocator.free_threads(request)
    except:
        log.exception(
            "Failed to free threads for workload: '{}' with primary allocator: '{}', "
            "falling back to: '{}'".format(
                request.get_workload_id(),
                self.__primary_allocator.__class__.__name__,
                self.__secondary_allocator.__class__.__name__))
        self.__secondary_free_threads_call_count += 1
        return self.__secondary_allocator.free_threads(request)
def __update_burst_cpusets(self):
    empty_thread_ids = self.__get_empty_thread_ids()
    error_count = 0
    for b_w in get_burst_workloads(self.__workloads.values()):
        log.info("updating burst workload: '{}'".format(b_w.get_id()))
        try:
            self.__cgroup_manager.set_cpuset(b_w.get_id(), empty_thread_ids)
        except:
            error_count += 1
            log.exception(
                "Failed to update burst workload: '{}', maybe it's gone.".format(b_w.get_id()))
    self.__error_count += error_count
def __schedule_loop(exit_handler: ExitHandler):
    log.info("Starting scheduling loop...")
    while True:
        try:
            sleep_time = _schedule_once(exit_handler)
            _notify_watchdog()
            log.debug("Scheduling thread sleeping for: '%d' seconds", sleep_time)
            time.sleep(sleep_time)
        except:
            log.exception("Failed to run scheduling loop")
            exit_handler.exit(SCHEDULING_LOOP_FAILURE_EXIT_CODE)
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        try:
            if is_kubernetes():
                workloads.append(get_workload_from_kubernetes(container.name))
            else:
                workloads.append(get_workload_from_disk(container.name))
        except:
            log.exception("Failed to read environment for container: '%s'", container.name)
    return workloads
def report_metrics(self, tags):
    log.debug("Reporting metrics")
    try:
        # Workload manager metrics
        self.__reg.gauge(RUNNING, tags).set(1)
        self.__reg.gauge(ADDED_KEY, tags).set(self.__workload_manager.get_added_count())
        self.__reg.gauge(REMOVED_KEY, tags).set(self.__workload_manager.get_removed_count())
        self.__reg.gauge(SUCCEEDED_KEY, tags).set(self.__workload_manager.get_success_count())
        self.__reg.gauge(FAILED_KEY, tags).set(self.__workload_manager.get_error_count())
        self.__reg.gauge(WORKLOAD_COUNT_KEY, tags).set(
            len(self.__workload_manager.get_workloads()))

        # Allocator metrics
        self.__reg.gauge(ALLOCATOR_CALL_DURATION, tags).set(
            self.__workload_manager.get_allocator_call_duration_sum_secs())
        self.__reg.gauge(FALLBACK_ALLOCATOR_COUNT, tags).set(
            self.__workload_manager.get_fallback_allocator_calls_count())
        self.__reg.gauge(IP_ALLOCATOR_TIMEBOUND_COUNT, tags).set(
            self.__workload_manager.get_time_bound_ip_allocator_solution_count())

        # Event manager metrics
        self.__reg.gauge(QUEUE_DEPTH_KEY, tags).set(self.__event_manager.get_queue_depth())
        self.__reg.gauge(EVENT_SUCCEEDED_KEY, tags).set(self.__event_manager.get_success_count())
        self.__reg.gauge(EVENT_FAILED_KEY, tags).set(self.__event_manager.get_error_count())
        self.__reg.gauge(EVENT_PROCESSED_KEY, tags).set(self.__event_manager.get_processed_count())

        # CPU metrics
        cross_package_violation_count = len(
            get_cross_package_violations(self.__workload_manager.get_cpu()))
        shared_core_violation_count = len(
            get_shared_core_violations(self.__workload_manager.get_cpu()))
        self.__reg.gauge(PACKAGE_VIOLATIONS_KEY, tags).set(cross_package_violation_count)
        self.__reg.gauge(CORE_VIOLATIONS_KEY, tags).set(shared_core_violation_count)

        log.debug("Reported metrics")
    except:
        log.exception("Failed to report metric")
def init():
    # Initialize currently running containers as workloads
    log.info("Isolating currently running workloads...")
    for workload in get_current_workloads(docker.from_env()):
        try:
            workload_manager.add_workload(workload)
        except:
            log.exception(
                "Failed to add currently running workload: '{}', maybe it exited."
                .format(workload.get_id()))
    log.info("Isolated currently running workloads.")

    # Start processing events after adding running workloads to avoid processing
    # a die event before we add a workload
    event_manager.start_processing_events()
def get_duration_predictions(input_str: str) -> List[DurationPrediction]:
    try:
        # Input is a ';'-separated list of percentile=duration pairs, e.g.
        # "0.05=0.29953;0.1=0.29953;0.15=0.29953;0.2=0.29953;0.25=0.29953;0.3=0.29953;0.35=0.29953;0.4=0.29953;0.45=0.29953;0.5=0.29953;0.55=0.29953;0.6=0.29953;0.65=0.29953;0.7=0.29953;0.75=0.29953;0.8=0.29953;0.85=0.29953;0.9=0.29953;0.95=0.29953"
        duration_predictions = []
        pairs = input_str.split(';')
        for p in pairs:
            k, v = p.split('=')
            duration_predictions.append(DurationPrediction(float(k), float(v)))
        return duration_predictions
    except:
        log.exception("Failed to parse duration predictions: '{}'".format(input_str))
        return []
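# An illustrative call, derived directly from the parsing logic above (the numbers are
# arbitrary example values):
predictions = get_duration_predictions("0.05=0.29953;0.1=0.31")
assert len(predictions) == 2
# Malformed input is logged and swallowed, yielding an empty list rather than raising:
assert get_duration_predictions("not-a-prediction") == []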
def __update_workload(self, func, arg, workload_id):
    try:
        with self.__lock:
            log.info("Acquired lock for func: {} on workload: {}".format(
                func.__name__, workload_id))
            start_time = time.time()
            func(arg)
            stop_time = time.time()
            self.__allocator_call_duration_sum_secs = stop_time - start_time

        log.info("Released lock for func: {} on workload: {}".format(
            func.__name__, workload_id))
        return True
    except:
        self.__error_count += 1
        log.exception("Failed to execute func: {} on workload: {}".format(
            func.__name__, workload_id))
        return False
def __set_cpuset(self, container_name, thread_ids):
    thread_ids_str = self.__get_thread_ids_str(thread_ids)
    try:
        self.__wait_for_files(container_name)
        log.info("updating workload: '{}' to cpuset.cpus: '{}'".format(
            container_name, thread_ids_str))
        set_cpuset(container_name, thread_ids_str)
        self.__write_succeeded(container_name)
    except:
        self.__write_failed()
        log.exception("Failed to apply cpuset to threads: '{}' for workload: '{}'".format(
            thread_ids_str, container_name))
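# For context: the kernel's cpuset.cpus file accepts a comma-separated list of thread ids
# or ranges, e.g. "0-3,8,9". A plausible shape for __get_thread_ids_str (an assumption for
# illustration, not the source implementation) is simply:
def get_thread_ids_str(thread_ids):
    return ",".join(str(t) for t in sorted(thread_ids))  # e.g. [9, 0, 1] -> "0,1,9"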
def __snapshot_usage_raw(self):
    try:
        # Avoid making a metrics query on a potentially empty dataset, which causes the
        # query command to fail and produces noisy logs that look like failures.
        workload_manager = get_workload_manager()
        if workload_manager is None or len(workload_manager.get_workloads()) == 0:
            log.info('No workloads so skipping pcp snapshot.')
            return

        if is_kubernetes():
            instance_filter = '.*titus-executor.*.service'
        else:
            instance_filter = r'/containers.slice/[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}'

        # e.g. pmrep -a /var/log/pcp/pmlogger/$(hostname)/ -S -60m -t 1m -y s -o csv -i .*titus-executor.*.service cgroup.cpuacct.usage cgroup.memory.usage
        snapshot_cmd_fmt = """
            pmrep -a {0} \
            -S -{1}s \
            -T -0s \
            -t {2}s \
            -y s \
            -o csv \
            -i {3} \
            cgroup.cpuacct.usage \
            cgroup.memory.usage \
            titus.network.in.bytes \
            titus.network.out.bytes \
            titus.disk.bytes_used
        """
        cmd_str = snapshot_cmd_fmt.format(
            get_pcp_archive_path(),
            self.__relative_start_sec,
            self.__interval_sec,
            instance_filter)

        log.info('Snapshotting usage from pcp: {}'.format(' '.join(cmd_str.split())))
        byte_array = subprocess.check_output(cmd_str, shell=True, timeout=self.__query_timeout_sec)
        raw_csv_snapshot = byte_array.decode('utf-8')
        usages = get_resource_usage(raw_csv_snapshot, self.__interval_count, self.__interval_sec)

        with self.__lock:
            self.__usages = usages
    except:
        log.exception("Failed to snapshot pcp data or compute usages")
def __groom(self, event):
    if not event[ACTION] == CREATE:
        self.__groomed_q.put_nowait(event)
        return

    name = None  # bound before the try so the except handler can always log it
    try:
        name = get_container_name(event)
        log.info("Grooming create event for: '{}'".format(name))
        self.__file_manger.wait_for_files(name)
        log.info("Groomed create event for: '{}'".format(name))
        self.__groomed_q.put_nowait(event)
    except:
        self.__error_event_count += 1
        log.exception(
            "Dropping CREATE event because waiting for files failed for: '{}'".format(name))
def __watch(self):
    while True:
        try:
            instance_id = get_config_manager().get_str("EC2_INSTANCE_ID")
            field_selector = "spec.nodeName={}".format(instance_id)
            log.info("Watching pods with field selector: %s", field_selector)
            v1 = client.CoreV1Api()
            w = watch.Watch()
            for event in w.stream(v1.list_pod_for_all_namespaces, field_selector=field_selector):
                self.__handle_event(event)
        except:
            log.exception("pod watch thread failed")
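# For reference: the kubernetes client's watch stream yields dicts with a 'type' key
# (ADDED/MODIFIED/DELETED) and the pod object under 'object'. A hedged sketch of the
# handler contract __watch assumes (only the name comes from the source; the body is
# hypothetical):
def __handle_event(self, event):
    pod = event['object']
    log.info("Pod event: %s %s", event['type'], pod.metadata.name)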
def free_threads():
    try:
        body = request.get_json()
        log.info("Processing free threads request: {}".format(body))
        threads_request = get_threads_request(body)
        response = get_free_cpu_allocator().free_threads(threads_request)

        global free_threads_success_count
        free_threads_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to free threads")
        global free_threads_failure_count
        free_threads_failure_count += 1
        return "Failed to free threads", 500
def rebalance():
    try:
        body = request.get_json()
        log.info("Processing rebalance threads request: {}".format(body))
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
def __process_events(self):
    while True:
        try:
            msg = self.__q.get()
            log.debug("Sending event log message: {}".format(msg))
            response = send_event_msg(msg, self.__address)

            if response.status_code != 200:
                log.error("Re-enqueuing failed event log message: {}".format(response.content))
                self.__retry_msg_count += 1
                self.__q.put_nowait(msg)
            else:
                self.__succeeded_msg_count += 1
        except:
            self.__failed_msg_count += 1
            log.exception("Failed to process event log message.")
def _schedule_once(exit_handler: ExitHandler) -> float:
    try:
        log.debug("Running pending scheduled tasks.")
        schedule.run_pending()

        sleep_time = SCHEDULING_SLEEP_INTERVAL
        if schedule.next_run() is not None:
            sleep_time = schedule.idle_seconds()

        if sleep_time < 0:
            sleep_time = SCHEDULING_SLEEP_INTERVAL

        return sleep_time
    except:
        log.exception("Failed to run scheduling once")
        exit_handler.exit(SCHEDULE_ONCE_FAILURE_EXIT_CODE)
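# A minimal sketch of how _schedule_once is driven with the `schedule` library, mirroring
# __schedule_loop above minus the watchdog notification (the job function and interval are
# illustrative assumptions, not from the source):
schedule.every(60).seconds.do(sample_metrics)  # sample_metrics is a hypothetical job
while True:
    time.sleep(_schedule_once(exit_handler))   # sleep the idle gap until the next run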
def update_numa_balancing(workload: Workload, cpu: Cpu):
    try:
        config_manager = get_config_manager()
        dynamic_numa_balancing_enabled = config_manager.get_bool(
            TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING,
            DEFAULT_TITUS_ISOLATE_DYNAMIC_NUMA_BALANCING)

        if not dynamic_numa_balancing_enabled:
            enable_numa_balancing()
            return

        if _occupies_entire_cpu(workload, cpu):
            disable_numa_balancing()
        else:
            enable_numa_balancing()
    except:
        log.exception("Failed to update NUMA balancing.")
def rebalance():
    try:
        request_ip = request.headers.get(FORWARDED_FOR_HEADER)
        log.info("Processing rebalance threads request (from, proxy): {}".format(request_ip))

        body = request.get_json()
        rebalance_request = get_rebalance_request(body)
        response = get_rebalance_cpu_allocator().rebalance(rebalance_request)

        global rebalance_success_count
        rebalance_success_count += 1

        log.info("Processed rebalance threads request (from, proxy): {}".format(request_ip))
        return jsonify(response.to_dict())
    except:
        log.exception("Failed to rebalance")
        global rebalance_failure_count
        rebalance_failure_count += 1
        return "Failed to rebalance", 500
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                labels = container.labels
                cpu = int(__get_value(labels, CPU_LABEL_KEY, -1))
                mem = int(__get_value(labels, MEM_LABEL_KEY, -1))
                disk = int(__get_value(labels, DISK_LABEL_KEY, -1))
                network = int(__get_value(labels, NETWORK_LABEL_KEY, -1))  # NETWORK_LABEL_KEY assumed to exist alongside the other label keys; reading this from DISK_LABEL_KEY would duplicate the disk value
                app_name = __get_value(labels, APP_NAME_LABEL_KEY)
                owner_email = __get_value(labels, OWNER_EMAIL_LABEL_KEY)
                command = __get_value(labels, COMMAND_LABEL_KEY)
                entrypoint = __get_value(labels, ENTRYPOINT_LABEL_KEY)
                job_type = __get_value(labels, JOB_TYPE_LABEL_KEY)
                workload_type = __get_value(labels, WORKLOAD_TYPE_LABEL_KEY)
                image = __get_image(container)

                workloads.append(
                    Workload(
                        identifier=workload_id,
                        thread_count=cpu,
                        mem=mem,
                        disk=disk,
                        network=network,
                        app_name=app_name,
                        owner_email=owner_email,
                        image=image,
                        command=command,
                        entrypoint=entrypoint,
                        job_type=job_type,
                        workload_type=workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected labels".format(workload_id))
    return workloads
def __process_events(self):
    while not self.__stopped:
        try:
            event = self.__groomed_q.get(timeout=self.__event_timeout)
        except Empty:
            log.debug("Timed out waiting for event on queue.")
            continue

        for event_handler in self.__event_handlers:
            try:
                event_handler.handle(event)
                self.__success_event_count += 1
            except:
                log.exception("Event handler: '{}' failed to handle event: '{}'".format(
                    type(event_handler).__name__, event))
                self.__error_event_count += 1

        # task_done() must be called on the queue the event was taken from (the groomed
        # queue), otherwise a join() on that queue would never unblock.
        self.__groomed_q.task_done()
        self.__processed_event_count += 1
        log.debug("processed event count: {}".format(self.__processed_event_count))
def get_current_workloads(docker_client):
    workloads = []
    for container in docker_client.containers.list():
        workload_id = container.name
        if __has_required_labels(container):
            try:
                cpu = int(container.labels[CPU_LABEL_KEY])
                workload_type = container.labels[WORKLOAD_TYPE_LABEL_KEY]
                workloads.append(Workload(workload_id, cpu, workload_type))
                log.info("Found running workload: '{}'".format(workload_id))
            except:
                log.exception(
                    "Failed to parse labels for container: '{}'".format(container.name))
        else:
            log.warning(
                "Found running workload: '{}' without expected label: '{}'".format(
                    workload_id, CPU_LABEL_KEY))
    return workloads