def delete_poddefaults_from_user_namespaces(
    poddefaults: List[Dict[str, Any]],
    user_namespaces: List[str],
    client: DynamicClient,
    logger: kopf.Logger,
) -> None:
    logger.debug(
        "Deleting PodDefaults %s from user Namespaces %s",
        [pd["metadata"]["name"] for pd in poddefaults],
        user_namespaces,
    )
    for poddefault in poddefaults:
        for namespace in user_namespaces:
            try:
                delete_poddefault(
                    namespace=namespace,
                    name=poddefault["metadata"]["name"],
                    client=client,
                    logger=logger,
                )
            except Exception as e:
                logger.warning(
                    "Unable to delete PodDefault %s from Namespace %s: %s",
                    poddefault["metadata"]["name"],
                    namespace,
                    str(e),
                )
def _remove_team_resources(namespace: str, team_spec: str, logger: kopf.Logger, **_: Any):  # type: ignore
    v1 = CoreV1Api()
    logger.info(f"_remove_team_resources looking for label orbit/team={team_spec}")

    # Get all the namespaces with the team label
    label_selector = f"orbit/team={team_spec}"
    all_namespaces = v1.list_namespace(label_selector=label_selector).to_dict()
    all_ns = [
        item.get("metadata").get("name")
        for item in all_namespaces["items"]
        if item.get("metadata", {}).get("name")
    ]

    # List all the resources we want to force-delete:
    # group, version, plural, status_element
    custom_object_list = [
        ["sagemaker.aws.amazon.com", "v1", "hyperparametertuningjobs", "trainingJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "trainingjobs", "trainingJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "batchtransformjobs", "transformJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "hostingdeployments", "status"],
        ["kubeflow.org", "v1", "notebooks", "NA"],
        ["kubeflow.org", "v1", "profile", "NA"],
        ["batch", "v1", "jobs", "NA"],
        ["apps", "v1", "deployments", "NA"],
        ["apps", "v1", "statefulsets", "NA"],
    ]

    for namespace in all_ns:
        logger.info(f"Looking at NS {namespace}")
        for co in custom_object_list:
            _delete_custom_objects(group=co[0], version=co[1], plural=co[2], namespace=namespace, logger=logger)
        _delete_pods(namespace=namespace, logger=logger)
        for co in custom_object_list[0:4]:
            _patch_and_delete_stubborn_custom_resources(
                group=co[0], version=co[1], plural=co[2], status_element=co[3], namespace=namespace, logger=logger
            )
def delete_userspace(namespace: str, name: str, client: dynamic.DynamicClient, logger: kopf.Logger) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION, group=ORBIT_API_GROUP, kind="UserSpace")
    api.delete(namespace=namespace, name=name, body={})
    logger.debug("Deleted UserSpace: %s in Namespace: %s", name, namespace)
def scheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    replication = status.get("replication", {})
    replication["codeBuildStatus"] = None
    replication["codeBuildPhase"] = None
    replication["codeBuildId"] = None

    attempt = replication.get("attempt", 0) + 1
    if attempt > CONFIG["max_replication_attempts"]:
        replication["replicationStatus"] = "MaxAttemptsExceeded"
        replication["attempt"] = attempt
        patch["status"] = {"replication": replication}
    else:
        with LOCK:
            global WORKERS_IN_PROCESS
            logger.debug("WORKERS_IN_PROCESS: %s", WORKERS_IN_PROCESS)
            if WORKERS_IN_PROCESS < CONFIG["workers"]:
                WORKERS_IN_PROCESS += 1
                replication["replicationStatus"] = "Scheduled"
                replication["attempt"] = attempt
                patch["status"] = {"replication": replication}
                logger.info("Schedule Attempt: %s", replication["attempt"])

    return cast(str, replication["replicationStatus"])
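# A minimal sketch of the module-level state the scheduler above relies on
# (CONFIG, LOCK, WORKERS_IN_PROCESS). The names match the handlers in this
# section, but the initial/default values shown here are assumptions, not the
# project's actual configuration.
import threading
from typing import Any, Dict

LOCK = threading.Lock()  # guards WORKERS_IN_PROCESS across concurrent handlers
WORKERS_IN_PROCESS = 0  # number of replications currently being processed
CONFIG: Dict[str, Any] = {  # assumed defaults; the real values come from imagereplication_utils.get_config()
    "workers": 2,
    "max_replication_attempts": 3,
}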
def codebuild_runner(
    spec: kopf.Spec,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    replication = status.get("replication", {})

    build_id, error = imagereplication_utils.replicate_image(
        src=spec["source"], dest=spec["destination"], config=CONFIG
    )
    replication["replicationStatus"] = "Replicating"
    replication["codeBuildId"] = build_id

    if error:
        replication["replicationStatus"] = "Failed"
        replication["failureDelay"] = 30
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1

    patch["status"] = {"replication": replication}

    if error:
        logger.error("CodeBuildId: %s Error: %s", build_id, error)
    else:
        logger.info("CodeBuildId: %s Error: %s", build_id, error)

    return cast(str, replication["replicationStatus"])
def delete_poddefaults(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    spec: kopf.Spec,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    **_: Any,
) -> str:
    team = labels.get("orbit/team", None)
    if team is None:
        logger.error("Missing required orbit/team label")
        return "MissingTeam"

    # Construct a pseudo PodDefault for the team to be deleted from user namespaces
    poddefault = poddefault_utils.construct(
        name=name,
        desc=spec.get("desc", ""),
        labels={"orbit/space": "team", "orbit/team": team},
    )
    user_namespaces = [namespace["name"] for namespace in namespaces_idx.get(team, [])]
    poddefault_utils.delete_poddefaults_from_user_namespaces(
        poddefaults=[poddefault],
        user_namespaces=user_namespaces,
        client=dynamic_client(),
        logger=logger,
    )

    return "PodDefaultsDeleted"
def codebuild_monitor(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    replication = status.get("replication", {})
    build_id = replication.get("codeBuildId", None)

    client = boto3.client("codebuild")
    build = client.batch_get_builds(ids=[build_id])["builds"][0]

    replication["codeBuildStatus"] = build["buildStatus"]
    replication["codeBuildPhase"] = build["currentPhase"]

    if replication["codeBuildStatus"] != "IN_PROGRESS":
        logger.info("CodeBuildId: %s BuildStatus: %s", build_id, replication["codeBuildStatus"])
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1
        codebuild_attempts = replication.get("codeBuildAttempts", [])
        codebuild_attempts.append(
            {
                "codeBuildId": build_id,
                "codeBuildStatus": build["buildStatus"],
                "codeBuildPhase": build["currentPhase"],
            }
        )
        replication["codeBuildAttempts"] = codebuild_attempts
        replication["replicationStatus"] = "Complete" if build["buildStatus"] == "SUCCEEDED" else "Failed"
        if replication["replicationStatus"] == "Failed":
            replication["failureDelay"] = 30

    patch["status"] = {"replication": replication}
    return cast(str, replication["codeBuildStatus"])
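# For reference, the monitor above polls CodeBuild directly instead of waiting
# for events. A standalone sketch of the same polling call, assuming default
# AWS credentials; get_build_state and the example build id are illustrative
# names, not part of the project.
from typing import Tuple

import boto3

def get_build_state(build_id: str) -> Tuple[str, str]:
    """Return (buildStatus, currentPhase) for a single CodeBuild build."""
    client = boto3.client("codebuild")
    # batch_get_builds accepts up to 100 ids; only one is requested here.
    build = client.batch_get_builds(ids=[build_id])["builds"][0]
    return build["buildStatus"], build["currentPhase"]

# Example usage (assumes a real build id):
# status, phase = get_build_state("example-project:00000000-0000-0000-0000-000000000000")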
def modify_poddefault(
    namespace: str,
    name: str,
    desc: str,
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    patch = {"spec": {"desc": desc}}
    api.patch(namespace=namespace, name=name, body=patch)
    logger.debug("Modified PodDefault: %s in Namespace: %s", name, namespace)
def configure(settings: kopf.OperatorSettings, logger: kopf.Logger, **_: Any) -> None:
    settings.persistence.progress_storage = kopf.MultiProgressStorage(
        [
            kopf.AnnotationsProgressStorage(prefix="orbit.aws"),
            kopf.StatusProgressStorage(field="status.orbit-aws"),
        ]
    )
    settings.persistence.finalizer = "teamspace-operator.orbit.aws/kopf-finalizer"
    settings.posting.level = logging.getLevelName(os.environ.get("EVENT_LOG_LEVEL", "INFO"))
    logger.info("START the Teamspace Controller")
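# For context, kopf wires a settings function like the one above through its
# startup hook. A minimal sketch, assuming a stock kopf installation; the
# decorator placement and handler name are assumptions about how the project
# registers this handler.
import logging
from typing import Any

import kopf

@kopf.on.startup()
def _example_configure(settings: kopf.OperatorSettings, logger: kopf.Logger, **_: Any) -> None:
    # Persist handler progress in annotations so state survives operator restarts.
    settings.persistence.progress_storage = kopf.AnnotationsProgressStorage(prefix="example.aws")
    settings.posting.level = logging.INFO
    logger.info("operator configured")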
def create_poddefault(
    namespace: str,
    poddefault: Dict[str, Any],
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    api.create(namespace=namespace, body=poddefault)
    logger.debug(
        "Created PodDefault: %s in Namespace: %s",
        poddefault["metadata"]["name"],
        namespace,
    )
def _delete_pods(namespace: str, logger: kopf.Logger, use_async=True, **_: Any):  # type: ignore
    logger.info(f"Deleting ALL PODS in ns {namespace}")
    api = CoreV1Api()
    try:
        api.delete_collection_namespaced_pod(
            namespace=namespace,
            async_req=use_async,
            grace_period_seconds=0,
            propagation_policy="Background",
            body=V1DeleteOptions(),
        )
    except ApiException as e:
        logger.warning("calling CoreV1Api->delete_collection_namespaced_pod: %s\n" % e)
def modify_userspace(
    namespace: str,
    name: str,
    desc: str,
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION, group=ORBIT_API_GROUP, kind="UserSpace")
    patch = {"spec": {"desc": desc}}
    api.patch(namespace=namespace, name=name, body=patch)
    logger.debug("Modified UserSpace: %s in Namespace: %s", name, namespace)
def _patch_and_delete_stubborn_custom_resources(  # type: ignore
    group: str,
    version: str,
    plural: str,
    namespace: str,
    status_element: str,
    logger: kopf.Logger,
    use_async=True,
    **_: Any,
):
    logger.info(f"_patch_and_delete_stubborn_custom_resources for {plural}.{group} in namespace {namespace}")
    co = CustomObjectsApi()
    resp = co.list_namespaced_custom_object(group=group, version=version, plural=plural, namespace=namespace)
    failed_res = [
        item.get("metadata").get("name")
        for item in resp["items"]
        if item.get("status", {}).get(status_element) in ["Failed", "Completed", "InProgress"]
    ]

    for item in failed_res:
        try:
            logger.info(f"Patching item {item} in {plural}.{group}")
            # Clear the finalizers first so the subsequent delete can complete.
            patch = {"metadata": {"finalizers": []}}
            co.patch_namespaced_custom_object(
                group=group, version=version, plural=plural, namespace=namespace, name=item, body=patch
            )
            logger.info(f"Deleting item {item} in {plural}.{group}")
            co.delete_namespaced_custom_object(
                group=group,
                version=version,
                plural=plural,
                namespace=namespace,
                name=item,
            )
        except ApiException as e:
            logger.warning("Trying to patch and delete failed: %s\n" % e)
def orbit_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {"jobStatus": "JobDetailsNotFound", "error": "No Namespace resource found"}
        }
        return "JobDetailsNotFound"

    for k8s_job in jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:  # To tackle the race condition caused by the Timer
        return "JobMetadataNotFound"

    if k8s_job.get("status", {}).get("active") == 1:
        job_status = "Active"
    else:
        job_status = k8s_job.get("status", {}).get("conditions", [{}])[0].get("type")

    k8s_job_reason = k8s_job.get("status", {}).get("conditions", [{}])[0].get("status")
    k8s_job_message = k8s_job.get("status", {}).get("conditions", [{}])[0].get("message")

    patch["status"] = {
        "orbitJobOperator": {
            "jobStatus": job_status,
            "jobName": k8s_job.get("name"),
            "k8sJobReason": k8s_job_reason,
            "k8sJobMessage": k8s_job_message,
        }
    }
    return job_status
def rescheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    logger.debug("Rescheduling")
    replication = status.get("replication", {})

    failure_delay = replication.get("failureDelay", 0)
    if failure_delay > 0:
        replication["failureDelay"] = failure_delay - 5
    else:
        replication["replicationStatus"] = "Pending"
        replication["failureDelay"] = None

    patch["status"] = {"replication": replication}
    return "Rescheduled"
def create_userspace(
    namespace: str,
    userspace: Dict[str, Any],
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION, group=ORBIT_API_GROUP, kind="UserSpace")
    api.create(namespace=namespace, body=userspace)
    logger.debug(
        "Created UserSpace: %s in Namespace: %s",
        userspace["metadata"]["name"],
        namespace,
    )
def uninstall_team(namespace: str, name: str, spec: kopf.Spec, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    logger.info("In UNINSTALL_TEAM Teamspace Controller")
    # spec:
    #   env: ${env_name}
    #   space: team
    #   team: ${team}
    team_spec = spec.get("team", None)
    logger.info(f"Preparing to destroy all resources in team namespace {namespace}")
    if team_spec:
        _remove_team_resources(namespace=namespace, team_spec=team_spec, logger=logger)
        _remove_user_namespaces(namespace=namespace, team_spec=team_spec, logger=logger)
        patch["status"] = {"teamspaceOperator": {"status": "DeleteProcessed"}}
    else:
        logger.warning("Team spec not found... moving on")
    return "Uninstalled"
def configure(settings: kopf.OperatorSettings, logger: kopf.Logger, **_: Any) -> None:
    settings.admission.server = kopf.WebhookServer(
        cafile="/certs/ca.crt",
        certfile="/certs/tls.crt",
        pkeyfile="/certs/tls.key",
        port=443,
    )
    settings.persistence.progress_storage = kopf.MultiProgressStorage(
        [
            kopf.AnnotationsProgressStorage(prefix="orbit.aws"),
            kopf.StatusProgressStorage(field="status.orbit-aws"),
        ]
    )
    settings.persistence.finalizer = "imagereplication-pod-webhook.orbit.aws/kopf-finalizer"
    settings.posting.level = logging.getLevelName(os.environ.get("EVENT_LOG_LEVEL", "INFO"))

    global CONFIG
    CONFIG = imagereplication_utils.get_config()
    logger.info("CONFIG: %s", CONFIG)
def _uninstall_chart(helm_release: str, namespace: str, logger: kopf.Logger) -> bool:
    install_status = True
    cmd = f"/usr/local/bin/helm uninstall --debug --namespace {namespace} {helm_release}"
    try:
        logger.debug("running uninstall cmd: %s", cmd)
        output = run_command(cmd)
        logger.debug(output)
        logger.info("finished uninstall cmd: %s", cmd)
    except Exception:
        logger.error("errored cmd: %s", cmd)
        install_status = False
    return install_status
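# run_command() is called above but not defined in this section. A minimal
# sketch of what such a helper might look like, assuming it shells out via
# subprocess and raises on a non-zero exit; this body is an assumption, not
# the project's actual implementation.
import subprocess

def run_command(cmd: str) -> str:
    """Run a command and return its stdout; raise CalledProcessError on failure."""
    result = subprocess.run(
        cmd.split(),  # naive split is sufficient for the helm command built above
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout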
def _delete_user_efs_endpoint(user_name: str, user_namespace: str, logger: kopf.Logger, meta: kopf.Meta) -> None:
    efs = boto3.client("efs")

    logger.info(f"Fetching the EFS access point in the namespace {user_namespace} for user {user_name}")
    efs_access_point_id = meta.get("labels", {}).get("userEfsApId", None)

    logger.info(f"Deleting the EFS access point {efs_access_point_id} for user {user_name}")
    try:
        efs.delete_access_point(AccessPointId=efs_access_point_id)
        logger.info(f"Access point {efs_access_point_id} deleted")
    except efs.exceptions.AccessPointNotFound:
        logger.warning(f"Access point not found: {efs_access_point_id}")
    except efs.exceptions.InternalServerError as e:
        logger.warning(e)
def replication_checker(
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    if status.get("replication", None) is not None:
        return cast(str, status["replication"].get("replicationStatus", "Unknown"))

    replication = {}
    if imagereplication_utils.image_replicated(image=spec["destination"], logger=logger):
        logger.info("Skipped: Image previously replicated to ECR")
        replication["replicationStatus"] = "ECRImageExists"
    else:
        logger.info("Starting Replication")
        replication["replicationStatus"] = "Pending"

    patch["status"] = {"replication": replication}
    return replication["replicationStatus"]
def update_pod_images(
    spec: kopf.Spec,
    patch: kopf.Patch,
    dryrun: bool,
    logger: kopf.Logger,
    imagereplications_idx: kopf.Index[str, str],
    **_: Any,
) -> kopf.Patch:
    if dryrun:
        logger.debug("DryRun - Skip Pod Mutation")
        return patch

    annotations = {}
    init_containers: List[Dict[str, Any]] = []
    containers: List[Dict[str, Any]] = []
    replications = {}

    def process_containers(
        src_containers: Optional[List[Dict[str, Any]]], dest_containers: List[Dict[str, Any]]
    ) -> None:
        for container in src_containers if src_containers else []:
            image = container.get("image", "")
            desired_image = imagereplication_utils.get_desired_image(image=image, config=CONFIG)
            if image != desired_image:
                container_copy = deepcopy(container)
                container_copy["image"] = desired_image
                dest_containers.append(container_copy)
                replications[image] = desired_image
                # "~1" is the JSON-Patch escape for "/" in the annotation key
                annotations[f"original-container-image~1{container['name']}"] = image

    process_containers(spec.get("initContainers", []), init_containers)
    process_containers(spec.get("containers", []), containers)

    if replications:
        client = dynamic_client()
        for source, destination in replications.items():
            if not imagereplications_idx.get(destination, []):
                imagereplication_utils.create_imagereplication(
                    namespace="orbit-system",
                    source=source,
                    destination=destination,
                    client=client,
                    logger=logger,
                )
            else:
                logger.debug("Skipping ImageReplication Creation")

    if annotations:
        patch["metadata"] = {"annotations": annotations}
        patch["spec"] = {}
        if init_containers:
            patch["spec"]["initContainers"] = init_containers
        if containers:
            patch["spec"]["containers"] = containers

    logger.debug("Patch: %s", str(patch))
    return patch
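# kopf exposes mutating admission webhooks through @kopf.on.mutate, which is
# presumably how a handler like update_pod_images is registered. A minimal,
# self-contained sketch under that assumption; the resource filter, handler
# name, and annotation key are illustrative, not taken from the project.
from typing import Any

import kopf

@kopf.on.mutate("pods", operation="CREATE")
def _example_mutate_pod(patch: kopf.Patch, dryrun: bool, **_: Any) -> None:
    # Admission handlers receive dryrun=True for dry-run API requests; skip mutation then.
    if dryrun:
        return
    patch["metadata"] = {"annotations": {"example.orbit.aws/mutated": "true"}}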
def create_poddefaults(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    **_: Any,
) -> str:
    team = labels.get("orbit/team", None)
    if team is None:
        logger.error("Missing required orbit/team label")
        patch["status"] = {"podDefaultsCreation": "MissingTeam"}
        return "MissingTeam"

    # Construct a pseudo PodDefault for the team to be copied to user namespaces
    poddefault = poddefault_utils.construct(
        name=name,
        desc=spec.get("desc", ""),
        labels={"orbit/space": "team", "orbit/team": team},
    )
    user_namespaces = [ns.get("name") for ns in namespaces_idx.get(team, [])]
    poddefault_utils.copy_poddefaults_to_user_namespaces(
        poddefaults=[poddefault],
        user_namespaces=user_namespaces,
        client=dynamic_client(),
        logger=logger,
    )

    patch["status"] = {"podDefaultsCreation": "Complete"}
    return "PodDefaultsCreated"
def _get_team_context(team: str, logger: kopf.Logger) -> Dict[str, Any]:
    try:
        api_instance = CoreV1Api()
        team_context_cf: V1ConfigMap = api_instance.read_namespaced_config_map("orbit-team-context", team)
        team_context_str = team_context_cf.data["team"]
        logger.debug("team context: %s", team_context_str)
        team_context: Dict[str, Any] = json.loads(team_context_str)
        logger.debug("team context keys: %s", team_context.keys())
    except Exception as e:
        logger.error("Error during fetching team context configmap")
        raise e
    return team_context
def _remove_user_namespaces(namespace: str, team_spec: str, logger: kopf.Logger, **_: Any):  # type: ignore
    logger.info(
        f"Removing all user namespaces with labels orbit/team={team_spec},orbit/space=user in namespace {namespace}"
    )
    v1 = CoreV1Api()
    label_selector = f"orbit/team={team_spec},orbit/space=user"
    all_namespaces = v1.list_namespace(label_selector=label_selector).to_dict()
    all_ns = [
        item.get("metadata").get("name")
        for item in all_namespaces["items"]
        if item.get("metadata", {}).get("name")
    ]
    for ns in all_ns:
        logger.info(f"Calling delete namespace {ns}")
        try:
            v1.delete_namespace(name=ns, async_req=True)
        except ApiException as e:
            logger.warning("calling CoreV1Api->delete_namespace had an error: %s\n" % e)
def copy_poddefaults_to_user_namespaces(
    poddefaults: List[Dict[str, Any]],
    user_namespaces: List[str],
    client: DynamicClient,
    logger: kopf.Logger,
) -> None:
    logger.debug(
        "Copying PodDefaults %s to user Namespaces %s",
        [pd["metadata"]["name"] for pd in poddefaults],
        user_namespaces,
    )
    for poddefault in poddefaults:
        for namespace in user_namespaces:
            try:
                kwargs = {
                    "name": poddefault["metadata"]["name"],
                    "desc": poddefault["spec"]["desc"],
                    "labels": {
                        "orbit/space": "user",
                        "orbit/team": poddefault["metadata"]["labels"].get("orbit/team", None),
                    },
                }
                create_poddefault(
                    namespace=namespace,
                    poddefault=construct(**kwargs),
                    client=client,
                    logger=logger,
                )
            except ApiException as e:
                logger.warning(
                    "Unable to create PodDefault %s in Namespace %s: %s",
                    poddefault["metadata"]["name"],
                    namespace,
                    str(e.body),
                )
            except Exception as e:
                logger.warning("Failed to create PodDefault: %s", str(e))
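# construct() is used here and in the team handlers but is not defined in this
# section. A minimal sketch of what such a builder might return, assuming the
# Kubeflow PodDefault CRD shape; the apiVersion and selector label are
# assumptions, not the project's actual values.
from typing import Any, Dict

def _example_construct(name: str, desc: str, labels: Dict[str, str]) -> Dict[str, Any]:
    """Build a PodDefault manifest dict (assumed shape, for illustration only)."""
    return {
        "apiVersion": "kubeflow.org/v1alpha1",  # assumed; the real code uses KUBEFLOW_API_GROUP/VERSION
        "kind": "PodDefault",
        "metadata": {"name": name, "labels": labels},
        "spec": {
            "desc": desc,
            # PodDefaults select pods by label; the key used here is illustrative.
            "selector": {"matchLabels": {f"orbit/{name}": ""}},
        },
    }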
def _delete_custom_objects(  # type: ignore
    group: str, version: str, plural: str, namespace: str, logger: kopf.Logger, use_async=True, **_: Any
):
    logger.info(f"Deleting {plural}.{group} in ns {namespace}")
    co = CustomObjectsApi()
    try:
        resp = co.delete_collection_namespaced_custom_object(
            group=group,
            version=version,
            namespace=namespace,
            plural=plural,
            grace_period_seconds=0,
            propagation_policy="Background",
            pretty="true",
            async_req=use_async,
            body=V1DeleteOptions(),
        )
        return resp
    except ApiException as e:
        logger.warning("calling CustomObjectsApi->delete_collection_namespaced_custom_object: %s\n" % e)
        logger.warning("Assume it did not exist")
def create_job(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    annotations: kopf.Annotations,
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    podsettings_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> str:
    ns: Optional[Dict[str, Any]] = None
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {"jobStatus": "JobCreationFailed", "error": "No Namespace resource found"}
        }
        return "JobCreationFailed"

    env = ns["env"]
    team = ns["team"]

    global ENV_CONTEXT  # Caching
    if ENV_CONTEXT is None:
        context = _load_env_context_from_ssm(env)
        if context is None:
            patch["status"] = {
                "orbitJobOperator": {"jobStatus": "JobCreationFailed", "error": "Unable to load Env Context from SSM"}
            }
            return "JobCreationFailed"
        else:
            ENV_CONTEXT = context

    node_type = spec.get("compute", {}).get("nodeType", "fargate")
    labels = {
        "app": "orbit-runner",
        "orbit/node-type": node_type,
        "notebook-name": spec.get("notebookName", ""),
        "orbit/attach-security-group": "yes" if node_type == "ec2" else "no",
    }

    podsetting_metadata: Dict[str, Any] = {}
    for podsetting_metadata in podsettings_idx.get((team, spec.get("compute", {}).get("podSetting", None)), []):
        logger.debug("PodSetting: %s", podsetting_metadata)

    job_spec = job_utils.construct_job_spec(
        env=env,
        team=team,
        env_context=ENV_CONTEXT,
        podsetting_metadata=podsetting_metadata,
        orbit_job_spec=spec,
        labels=labels,
    )
    logger.debug("spec: %s", spec)

    if spec.get("schedule"):
        cronjob_id = f"orbit-{namespace}-{spec.get('triggerName')}"
        cron_job_template: V1beta1JobTemplateSpec = V1beta1JobTemplateSpec(spec=job_spec)
        cron_job_spec: V1beta1CronJobSpec = V1beta1CronJobSpec(
            job_template=cron_job_template, schedule=spec.get("schedule")
        )
        job = V1beta1CronJob(
            api_version="batch/v1beta1",
            kind="CronJob",
            metadata=V1ObjectMeta(
                name=cronjob_id,
                labels={**labels, **spec.get("compute", {}).get("labels", {})},
                namespace=namespace,
            ),
            status=V1beta1CronJobStatus(),
            spec=cron_job_spec,
        )
        kopf.adopt(job, nested="spec.template")
        cron_job_instance: V1beta1CronJob = BatchV1beta1Api().create_namespaced_cron_job(namespace=namespace, body=job)
        cronjob_instance_metadata: V1ObjectMeta = cron_job_instance.metadata
        logger.debug("Started Cron Job: %s", cronjob_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "CronJob"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": cronjob_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "CronJobCreated"
    else:
        job = V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=V1ObjectMeta(labels={**labels, **spec.get("compute", {}).get("labels", {})}),
            spec=job_spec,
        )
        kopf.adopt(job, nested="spec.template")
        job_instance: V1Job = BatchV1Api().create_namespaced_job(namespace=namespace, body=job)
        job_instance_metadata: V1ObjectMeta = job_instance.metadata
        logger.debug("Started Job: %s", job_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "Job"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": job_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "JobCreated"
def delete_poddefault(namespace: str, name: str, client: dynamic.DynamicClient, logger: kopf.Logger) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    api.delete(namespace=namespace, name=name, body={})
    logger.debug("Deleted PodDefault: %s in Namespace: %s", name, namespace)
def orbit_cron_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    cron_jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {"jobStatus": "JobDetailsNotFound", "error": "No Namespace resource found"}
        }
        return "JobDetailsNotFound"

    logger.debug("cron_jobs_idx: %s", cron_jobs_idx)
    for k8s_job in cron_jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:  # To tackle the race condition caused by the Timer
        return "JobMetadataNotFound"

    if not k8s_job.get("status", {}):
        cron_job_status = "Activating"
    else:
        cron_job_status = "Active"

    if k8s_job.get("status"):
        for i in k8s_job.get("status", {}).get("active", [{}]):
            if i.get("name") not in status.get("orbitJobOperator", {}).get("cronJobIds", []):
                cron_job_ids: List[str] = status.get("orbitJobOperator", {}).get("cronJobIds", [])
                cron_job_ids.append(i.get("name"))
                patch["status"] = {
                    "orbitJobOperator": {
                        "jobStatus": cron_job_status,
                        "jobName": k8s_job.get("name"),
                        "cronJobIds": cron_job_ids,
                    }
                }
            else:
                return cron_job_status
    else:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": cron_job_status,
                "jobName": k8s_job.get("name"),
                "cronJobIds": status.get("orbitJobOperator", {}).get("cronJobIds", []),
            }
        }
    return cron_job_status