Example #1
def delete_poddefaults_from_user_namespaces(
    poddefaults: List[Dict[str, Any]],
    user_namespaces: List[str],
    client: DynamicClient,
    logger: kopf.Logger,
) -> None:
    logger.debug(
        "Deleting PodDefaults %s from user Namespaces %s",
        [pd["metadata"]["name"] for pd in poddefaults],
        user_namespaces,
    )
    for poddefault in poddefaults:
        for namespace in user_namespaces:
            try:
                delete_poddefault(
                    namespace=namespace,
                    name=poddefault["metadata"]["name"],
                    client=client,
                    logger=logger,
                )
            except Exception as e:
                logger.warning(
                    "Unable to delete PodDefault %s from Namespace %s: %s",
                    poddefault["metadata"]["name"],
                    namespace,
                    str(e),
                )
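# A minimal usage sketch (hypothetical Namespace and PodDefault names; assumes the
# construct() and dynamic_client() helpers used elsewhere in this codebase):
#
#   delete_poddefaults_from_user_namespaces(
#       poddefaults=[construct(name="team-poddefault", desc="", labels={"orbit/team": "my-team"})],
#       user_namespaces=["my-team-user-a", "my-team-user-b"],
#       client=dynamic_client(),
#       logger=logger,
#   )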
def _remove_team_resources(namespace: str, team_spec: str, logger: kopf.Logger, **_: Any):  # type: ignore
    v1 = CoreV1Api()
    logger.info(f"_remove_team_resources looking for Namespaces with label orbit/team={team_spec}")
    # Get all the namespaces with the team label
    label_selector = f"orbit/team={team_spec}"
    all_namespaces = v1.list_namespace(label_selector=label_selector).to_dict()
    all_ns = [
        item.get("metadata").get("name") for item in all_namespaces["items"] if item.get("metadata", {}).get("name")
    ]
    # List all the resources we want to force-delete:
    # group, version, plural, status_element
    custom_object_list = [
        ["sagemaker.aws.amazon.com", "v1", "hyperparametertuningjobs", "trainingJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "trainingjobs", "trainingJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "batchtransformjobs", "transformJobStatus"],
        ["sagemaker.aws.amazon.com", "v1", "hostingdeployments", "status"],
        ["kubeflow.org", "v1", "notebooks", "NA"],
        ["kubeflow.org", "v1", "profile", "NA"],
        ["batch", "v1", "jobs", "NA"],
        ["apps", "v1", "deployments", "NA"],
        ["apps", "v1", "statefulsets", "NA"],
    ]

    for namespace in all_ns:
        logger.info(f"Looking at NS {namespace}")

        for co in custom_object_list:
            _delete_custom_objects(group=co[0], version=co[1], plural=co[2], namespace=namespace, logger=logger)
        _delete_pods(namespace=namespace, logger=logger)

        for co in custom_object_list[0:4]:
            _patch_and_delete_stubborn_custom_resources(
                group=co[0], version=co[1], plural=co[2], status_element=co[3], namespace=namespace, logger=logger
            )
Example #3
def delete_userspace(namespace: str, name: str, client: dynamic.DynamicClient,
                     logger: kopf.Logger) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION,
                               group=ORBIT_API_GROUP,
                               kind="UserSpace")
    api.delete(namespace=namespace, name=name, body={})
    logger.debug("Deleted UserSpace: %s in Namespace: %s", name, namespace)
Example #4
def scheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger,
              **_: Any) -> str:
    replication = status.get("replication", {})
    replication["codeBuildStatus"] = None
    replication["codeBuildPhase"] = None
    replication["codeBuildId"] = None

    attempt = replication.get("attempt", 0) + 1
    if attempt > CONFIG["max_replication_attempts"]:
        replication["replicationStatus"] = "MaxAttemptsExceeded"
        replication["attempt"] = attempt

        patch["status"] = {"replication": replication}
    else:
        with LOCK:
            global WORKERS_IN_PROCESS
            logger.debug("WORKERS_IN_PROCESS: %s", WORKERS_IN_PROCESS)
            if WORKERS_IN_PROCESS < CONFIG["workers"]:
                WORKERS_IN_PROCESS += 1
                replication["replicationStatus"] = "Scheduled"
                replication["attempt"] = attempt

                patch["status"] = {"replication": replication}
                logger.info("Schedule Attempt: %s", replication["attempt"])

    return cast(str, replication["replicationStatus"])
Example #5
def codebuild_runner(
    spec: kopf.Spec,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    replication = status.get("replication", {})

    build_id, error = imagereplication_utils.replicate_image(
        src=spec["source"], dest=spec["destination"], config=CONFIG)

    replication["replicationStatus"] = "Replicating"
    replication["codeBuildId"] = build_id

    if error:
        replication["replicationStatus"] = "Failed"
        replication["failureDelay"] = 30
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1

    patch["status"] = {"replication": replication}
    if error:
        logger.error("CodeBuildId: %s Error: %s", build_id, error)
    else:
        logger.info("CodeBuildId: %s Error: %s", build_id, error)

    return cast(str, replication["replicationStatus"])
def delete_poddefaults(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    spec: kopf.Spec,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    **_: Any,
) -> str:
    team = labels.get("orbit/team", None)
    if team is None:
        logger.error("Missing required orbit/team label")
        return "MissingTeam"

    # Construct a pseudo poddefault for the team to be deleted from users
    poddefault = poddefault_utils.construct(
        name=name,
        desc=spec.get("desc", ""),
        labels={
            "orbit/space": "team",
            "orbit/team": team
        },
    )
    user_namespaces = [
        namespace["name"] for namespace in namespaces_idx.get(team, [])
    ]
    poddefault_utils.delete_poddefaults_from_user_namespaces(
        poddefaults=[poddefault],
        user_namespaces=user_namespaces,
        client=dynamic_client(),
        logger=logger,
    )

    return "PodDefaultsDeleted"
Example #7
def codebuild_monitor(status: kopf.Status, patch: kopf.Patch,
                      logger: kopf.Logger, **_: Any) -> str:
    replication = status.get("replication", {})

    build_id = replication.get("codeBuildId", None)

    client = boto3.client("codebuild")
    build = client.batch_get_builds(ids=[build_id])["builds"][0]
    replication["codeBuildStatus"] = build["buildStatus"]
    replication["codeBuildPhase"] = build["currentPhase"]

    if replication["codeBuildStatus"] != "IN_PROGRESS":
        logger.info("CodeBuildId: %s BuildStatus: %s", build_id,
                    replication["codeBuildStatus"])
        with LOCK:
            global WORKERS_IN_PROCESS
            WORKERS_IN_PROCESS -= 1
        codebuild_attempts = replication.get("codeBuildAttempts", [])
        codebuild_attempts.append({
            "codeBuildId": build_id,
            "codeBuildStatus": build["buildStatus"],
            "codeBuildPhase": build["currentPhase"],
        })
        replication["codeBuildAttempts"] = codebuild_attempts
        replication["replicationStatus"] = "Complete" if build[
            "buildStatus"] == "SUCCEEDED" else "Failed"

    if replication["replicationStatus"] == "Failed":
        replication["failureDelay"] = 30

    patch["status"] = {"replication": replication}
    return cast(str, replication["codeBuildStatus"])
Example #8
def modify_poddefault(
    namespace: str,
    name: str,
    desc: str,
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    patch = {"spec": {"desc": desc}}
    api.patch(namespace=namespace, name=name, body=patch)
    logger.debug("Modified PodDefault: %s in Namespace: %s", name, namespace)
def configure(settings: kopf.OperatorSettings, logger: kopf.Logger, **_: Any) -> None:
    settings.persistence.progress_storage = kopf.MultiProgressStorage(
        [
            kopf.AnnotationsProgressStorage(prefix="orbit.aws"),
            kopf.StatusProgressStorage(field="status.orbit-aws"),
        ]
    )
    settings.persistence.finalizer = "teamspace-operator.orbit.aws/kopf-finalizer"
    settings.posting.level = logging.getLevelName(os.environ.get("EVENT_LOG_LEVEL", "INFO"))
    logger.info("START the Teamspace Controller")
Example #10
def create_poddefault(
    namespace: str,
    poddefault: Dict[str, Any],
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    api.create(namespace=namespace, body=poddefault)
    logger.debug(
        "Created PodDefault: %s in Namespace: %s",
        poddefault["metadata"]["name"],
        namespace,
    )
def _delete_pods(namespace: str, logger: kopf.Logger, use_async=True, **_: Any):  # type: ignore
    logger.info(f"Deleting ALL PODS in ns {namespace}")
    api = CoreV1Api()
    try:
        api.delete_collection_namespaced_pod(
            namespace=namespace,
            async_req=use_async,
            grace_period_seconds=0,
            propagation_policy="Background",
            body=V1DeleteOptions(),
        )
    except ApiException as e:
        logger.warning("calling CoreV1Api->delete_collection_namespaced_pod: %s\n" % e)
Example #12
def modify_userspace(
    namespace: str,
    name: str,
    desc: str,
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION,
                               group=ORBIT_API_GROUP,
                               kind="UserSpace")
    patch = {"spec": {"desc": desc}}
    api.patch(namespace=namespace, name=name, body=patch)
    logger.debug("Modified UserSpace: %s in Namespace: %s", name, namespace)
def _patch_and_delete_stubborn_custom_resources(  # type: ignore
    group: str,
    version: str,
    plural: str,
    namespace: str,
    status_element: str,
    logger: kopf.Logger,
    use_async=True,
    **_: Any,
):
    logger.info(f"_patch_and_delete_stubborn_custom_resources for {plural}.{group} in namespace {namespace}")
    co = CustomObjectsApi()
    resp = co.list_namespaced_custom_object(group=group, version=version, plural=plural, namespace=namespace)
    failed_res = [
        item.get("metadata").get("name")
        for item in resp["items"]
        if item.get("status", {}).get(status_element) in ["Failed", "Completed", "InProgress"]
    ]
    for item in failed_res:
        try:
            logger.info(f"Patching item {item} in {plural}.{group}")
            # Clear finalizers so Kubernetes can finish deleting the stuck resource
            patch = {"metadata": {"finalizers": []}}
            co.patch_namespaced_custom_object(
                group=group, version=version, plural=plural, namespace=namespace, name=item, body=patch
            )
            logger.info(f"Deleting item {item} in {plural}.{group}")
            co.delete_namespaced_custom_object(
                group=group,
                version=version,
                plural=plural,
                namespace=namespace,
                name=item,
            )
        except ApiException as e:
            logger.warning("Trying to patch and delete failed: %s\n" % e)
def orbit_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    # namespaces_idx is a kopf Index (key -> store of entries); this loop leaves
    # ns bound to the last (typically only) entry for the Namespace, or None if
    # the index has no entry for it.
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobDetailsNotFound",
                "error": "No Namespace resource found"
            }
        }
        return "JobDetailsNotFound"

    for k8s_job in jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:  # To tackle the race condition caused by Timer
        return "JobMetadataNotFound"

    if k8s_job.get("status", {}).get("active") == 1:
        job_status = "Active"
    else:
        job_status = k8s_job.get("status", {}).get("conditions",
                                                   [{}])[0].get("type")

    k8s_job_reason = k8s_job.get("status", {}).get("conditions",
                                                   [{}])[0].get("status")
    k8s_job_message = k8s_job.get("status", {}).get("conditions",
                                                    [{}])[0].get("message")

    patch["status"] = {
        "orbitJobOperator": {
            "jobStatus": job_status,
            "jobName": k8s_job.get("name"),
            "k8sJobReason": k8s_job_reason,
            "k8sJobMessage": k8s_job_message,
        }
    }
    return job_status
Example #15
def rescheduler(status: kopf.Status, patch: kopf.Patch, logger: kopf.Logger,
                **_: Any) -> str:
    logger.debug("Rescheduling")
    replication = status.get("replication", {})
    failure_delay = replication.get("failureDelay", 0)

    if failure_delay > 0:
        replication["failureDelay"] = failure_delay - 5
    else:
        replication["replicationStatus"] = "Pending"
        replication["failureDelay"] = None

    patch["status"] = {"replication": replication}
    return "Rescheduled"
Example #16
def create_userspace(
    namespace: str,
    userspace: Dict[str, Any],
    client: dynamic.DynamicClient,
    logger: kopf.Logger,
) -> None:
    api = client.resources.get(api_version=ORBIT_API_VERSION,
                               group=ORBIT_API_GROUP,
                               kind="UserSpace")
    api.create(namespace=namespace, body=userspace)
    logger.debug(
        "Created UserSpace: %s in Namespace: %s",
        userspace["metadata"]["name"],
        namespace,
    )
def uninstall_team(namespace: str, name: str, spec: kopf.Spec, patch: kopf.Patch, logger: kopf.Logger, **_: Any) -> str:
    logger.info("In UNINSTALL_TEAM Teamspace Controller")

    # spec:
    # env: ${env_name}
    # space: team
    # team: ${team}
    team_spec = spec.get("team", None)
    logger.info(f"Preparing to Destroy all resources in team namespace {namespace}")
    if team_spec:
        _remove_team_resources(namespace=namespace, team_spec=team_spec, logger=logger)
        _remove_user_namespaces(namespace=namespace, team_spec=team_spec, logger=logger)
        patch["status"] = {"teamspaceOperator": {"status": "DeleteProcessed"}}
    else:
        logger.warning("Team spec not found...moving on")
    return "Uninstalled"
def configure(settings: kopf.OperatorSettings, logger: kopf.Logger,
              **_: Any) -> None:
    settings.admission.server = kopf.WebhookServer(
        cafile="/certs/ca.crt",
        certfile="/certs/tls.crt",
        pkeyfile="/certs/tls.key",
        port=443,
    )
    settings.persistence.progress_storage = kopf.MultiProgressStorage([
        kopf.AnnotationsProgressStorage(prefix="orbit.aws"),
        kopf.StatusProgressStorage(field="status.orbit-aws"),
    ])
    settings.persistence.finalizer = "imagereplication-pod-webhook.orbit.aws/kopf-finalizer"
    settings.posting.level = logging.getLevelName(
        os.environ.get("EVENT_LOG_LEVEL", "INFO"))

    global CONFIG
    CONFIG = imagereplication_utils.get_config()
    logger.info("CONFIG: %s", CONFIG)
def _uninstall_chart(helm_release: str, namespace: str,
                     logger: kopf.Logger) -> bool:
    install_status = True
    cmd = f"/usr/local/bin/helm uninstall --debug --namespace {namespace} {helm_release}"
    try:
        logger.debug("running uninstall cmd: %s", cmd)
        output = run_command(cmd)
        logger.debug(output)
        logger.info("finished uninstall cmd: %s", cmd)
    except Exception:
        logger.error("errored cmd: %s", cmd)
        install_status = False
    return install_status
def _delete_user_efs_endpoint(user_name: str, user_namespace: str,
                              logger: kopf.Logger, meta: kopf.Meta) -> None:
    efs = boto3.client("efs")

    logger.info(
        f"Fetching the EFS access point in the namespace {user_namespace} for user {user_name}"
    )

    efs_access_point_id = meta.get("labels", {}).get("userEfsApId", None)

    logger.info(
        f"Deleting the EFS access point {efs_access_point_id} for user {user_name}"
    )

    try:
        efs.delete_access_point(AccessPointId=efs_access_point_id)
        logger.info(f"Access point {efs_access_point_id} deleted")
    except efs.exceptions.AccessPointNotFound:
        logger.warning(f"Access point not found: {efs_access_point_id}")
    except efs.exceptions.InternalServerError as e:
        logger.warning(e)
Example #21
def replication_checker(
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    **_: Any,
) -> str:
    if status.get("replication", None) is not None:
        return cast(str, status["replication"].get("replicationStatus",
                                                   "Unknown"))

    replication = {}
    if imagereplication_utils.image_replicated(image=spec["destination"],
                                               logger=logger):
        logger.info("Skipped: Image previously replicated to ECR")
        replication["replicationStatus"] = "ECRImageExists"
    else:
        logger.info("Starting Replication")
        replication["replicationStatus"] = "Pending"

    patch["status"] = {"replication": replication}
    return replication["replicationStatus"]
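# Replication lifecycle, as implied by replication_checker(), scheduler(),
# codebuild_runner(), codebuild_monitor() and rescheduler() in this section
# (a reading of the code, not an authoritative spec):
#   ECRImageExists             image already replicated, nothing to do
#   Pending -> Scheduled       scheduler() admits work while WORKERS_IN_PROCESS
#                              is below CONFIG["workers"]
#   Scheduled -> Replicating   codebuild_runner() starts a CodeBuild job
#   Replicating -> Complete    codebuild_monitor() sees buildStatus SUCCEEDED
#   -> Failed                  set by codebuild_runner()/codebuild_monitor() on
#                              error; failureDelay is set to 30 and rescheduler()
#                              counts it down, then resets the status to Pending
#   MaxAttemptsExceeded        scheduler() gives up after
#                              CONFIG["max_replication_attempts"] attempts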
def update_pod_images(
    spec: kopf.Spec,
    patch: kopf.Patch,
    dryrun: bool,
    logger: kopf.Logger,
    imagereplications_idx: kopf.Index[str, str],
    **_: Any,
) -> kopf.Patch:
    if dryrun:
        logger.debug("DryRun - Skip Pod Mutation")
        return patch

    annotations = {}
    init_containers: List[Dict[str, Any]] = []
    containers: List[Dict[str, Any]] = []
    replications = {}

    def process_containers(src_containers: Optional[List[Dict[str, Any]]],
                           dest_containers: List[Dict[str, Any]]) -> None:
        for container in src_containers if src_containers else []:
            image = container.get("image", "")
            desired_image = imagereplication_utils.get_desired_image(
                image=image, config=CONFIG)
            if image != desired_image:
                container_copy = deepcopy(container)
                container_copy["image"] = desired_image
                dest_containers.append(container_copy)
                replications[image] = desired_image
                # "~1" is the JSON-Pointer (RFC 6901) escape for "/": the key
                # decodes to "original-container-image/<container name>"
                annotations[
                    f"original-container-image~1{container['name']}"] = image

    process_containers(spec.get("initContainers", []), init_containers)
    process_containers(spec.get("containers", []), containers)

    if replications:
        client = dynamic_client()
        for source, destination in replications.items():
            if not imagereplications_idx.get(destination, []):
                imagereplication_utils.create_imagereplication(
                    namespace="orbit-system",
                    source=source,
                    destination=destination,
                    client=client,
                    logger=logger,
                )
            else:
                logger.debug("Skipping ImageReplication Creation")

    if annotations:
        patch["metadata"] = {"annotations": annotations}
        patch["spec"] = {}
        if init_containers:
            patch["spec"]["initContainers"] = init_containers
        if containers:
            patch["spec"]["containers"] = containers

    logger.debug("Patch: %s", str(patch))
    return patch
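# A small, self-contained sketch (not part of the operator) of the escaping used
# in the annotation keys above: in JSON Pointer (RFC 6901), "/" must be written
# as "~1" and "~" as "~0", which is why the patch key reads
# "original-container-image~1<container name>".
def _json_pointer_escape(segment: str) -> str:
    # Escape "~" before "/" so the "~1" we introduce is not itself re-escaped.
    return segment.replace("~", "~0").replace("/", "~1")


assert _json_pointer_escape("original-container-image/main") == "original-container-image~1main"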
def create_poddefaults(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    **_: Any,
) -> str:
    team = labels.get("orbit/team", None)
    if team is None:
        logger.error("Missing required orbit/team label")
        patch["status"] = {"podDefaultsCreation": "MissingTeam"}
        return "MissingTeam"

    # Construct a pseudo poddefault for the team to be copied to users
    poddefault = poddefault_utils.construct(
        name=name,
        desc=spec.get("desc", ""),
        labels={
            "orbit/space": "team",
            "orbit/team": team
        },
    )
    user_namespaces = [ns.get("name") for ns in namespaces_idx.get(team, [])]
    poddefault_utils.copy_poddefaults_to_user_namespaces(
        poddefaults=[poddefault],
        user_namespaces=user_namespaces,
        client=dynamic_client(),
        logger=logger,
    )

    patch["status"] = {"podDefaultsCreation": "Complete"}
    return "PodDefaultsCreated"
def _get_team_context(team: str, logger: kopf.Logger) -> Dict[str, Any]:
    try:
        api_instance = CoreV1Api()
        team_context_cf: V1ConfigMap = api_instance.read_namespaced_config_map(
            "orbit-team-context", team)
        team_context_str = team_context_cf.data["team"]

        logger.debug("team context: %s", team_context_str)
        team_context: Dict[str, Any] = json.loads(team_context_str)
        logger.debug("team context keys: %s", team_context.keys())
    except Exception as e:
        logger.error("Error during fetching team context configmap")
        raise e
    return team_context
def _remove_user_namespaces(namespace: str, team_spec: str, logger: kopf.Logger, **_: Any):  # type: ignore
    logger.info(
        f"Removing all user namespaces with labels orbit/team={team_spec},orbit/space=user in namespace {namespace} "
    )

    v1 = CoreV1Api()
    label_selector = f"orbit/team={team_spec},orbit/space=user"
    all_namespaces = v1.list_namespace(label_selector=label_selector).to_dict()

    all_ns = [
        item.get("metadata").get("name") for item in all_namespaces["items"] if item.get("metadata", {}).get("name")
    ]
    for ns in all_ns:
        logger.info(f"Calling delete namespace {ns}")
        try:
            v1.delete_namespace(name=ns, async_req=True)
        except ApiException as e:
            logger.warning("calling CoreV1Api->delete_namespace had an error: %s\n" % e)
Example #26
def copy_poddefaults_to_user_namespaces(
    poddefaults: List[Dict[str, Any]],
    user_namespaces: List[str],
    client: DynamicClient,
    logger: kopf.Logger,
) -> None:
    logger.debug(
        "Copying PodDefaults %s to user Namespaces %s",
        [pd["metadata"]["name"] for pd in poddefaults],
        user_namespaces,
    )
    for poddefault in poddefaults:
        for namespace in user_namespaces:
            try:
                kwargs = {
                    "name": poddefault["metadata"]["name"],
                    "desc": poddefault["spec"]["desc"],
                    "labels": {
                        "orbit/space": "user",
                        "orbit/team": poddefault["metadata"]["labels"].get("orbit/team", None),
                    },
                }
                create_poddefault(
                    namespace=namespace,
                    poddefault=construct(**kwargs),
                    client=client,
                    logger=logger,
                )
            except ApiException as e:
                logger.warning(
                    "Unable to create PodDefault %s in Namespace %s: %s",
                    poddefault["metadata"]["name"],
                    namespace,
                    str(e.body),
                )
            except Exception as e:
                logger.warning(
                    "Failed to create PodDefault %s in Namespace %s: %s",
                    poddefault["metadata"]["name"],
                    namespace,
                    str(e),
                )
def _delete_custom_objects(  # type: ignore
    group: str, version: str, plural: str, namespace: str, logger: kopf.Logger, use_async=True, **_: Any
):
    logger.info(f"Deleting {plural}.{group} in ns {namespace}")
    co = CustomObjectsApi()

    try:
        resp = co.delete_collection_namespaced_custom_object(
            group=group,
            version=version,
            namespace=namespace,
            plural=plural,
            grace_period_seconds=0,
            propagation_policy="Background",
            pretty="true",
            async_req=use_async,
            body=V1DeleteOptions(),
        )

        return resp
    except ApiException as e:
        logger.warning("calling CustomObjectsApi->delete_collection_namespaced_custom_object: %s\n" % e)
        logger.warning("Assume it did not exist")
def create_job(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    annotations: kopf.Annotations,
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    podsettings_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> str:
    ns: Optional[Dict[str, Any]] = None
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreationFailed",
                "error": "No Namespace resource found"
            }
        }
        return "JobCreationFailed"

    env = ns["env"]
    team = ns["team"]

    global ENV_CONTEXT  # Caching
    if ENV_CONTEXT is None:
        context = _load_env_context_from_ssm(env)
        if context is None:
            patch["status"] = {
                "orbitJobOperator": {
                    "jobStatus": "JobCreationFailed",
                    "error": "Unable to load Env Context from SSM"
                }
            }
            return "JobCreationFailed"
        else:
            ENV_CONTEXT = context

    node_type = spec.get("compute", {}).get("nodeType", "fargate")
    labels = {
        "app": "orbit-runner",
        "orbit/node-type": node_type,
        "notebook-name": spec.get("notebookName", ""),
        "orbit/attach-security-group": "yes" if node_type == "ec2" else "no",
    }

    podsetting_metadata: Dict[str, Any] = {}
    for podsetting_metadata in podsettings_idx.get(
        (team, spec.get("compute", {}).get("podSetting", None)), []):
        logger.debug("PodSetting: %s", podsetting_metadata)

    job_spec = job_utils.construct_job_spec(
        env=env,
        team=team,
        env_context=ENV_CONTEXT,
        podsetting_metadata=podsetting_metadata,
        orbit_job_spec=spec,
        labels=labels,
    )

    logger.debug("spec: %s", spec)
    if spec.get("schedule"):
        cronjob_id = f"orbit-{namespace}-{spec.get('triggerName')}"
        cron_job_template: V1beta1JobTemplateSpec = V1beta1JobTemplateSpec(
            spec=job_spec)
        cron_job_spec: V1beta1CronJobSpec = V1beta1CronJobSpec(
            job_template=cron_job_template, schedule=spec.get("schedule"))
        job = V1beta1CronJob(
            api_version="batch/v1beta1",
            kind="CronJob",
            metadata=V1ObjectMeta(name=cronjob_id,
                                  labels={
                                      **labels,
                                      **spec.get("compute", {}).get(
                                          "labels", {})
                                  },
                                  namespace=namespace),
            status=V1beta1CronJobStatus(),
            spec=cron_job_spec,
        )
        kopf.adopt(job, nested="spec.template")
        cron_job_instance: V1beta1CronJob = BatchV1beta1Api(
        ).create_namespaced_cron_job(namespace=namespace, body=job)
        cronjob_instance_metadata: V1ObjectMeta = cron_job_instance.metadata
        logger.debug("Started Cron Job: %s", cronjob_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "CronJob"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": cronjob_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "CronJobCreated"
    else:
        job = V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=V1ObjectMeta(labels={
                **labels,
                **spec.get("compute", {}).get("labels", {})
            }),
            spec=job_spec,
        )

        kopf.adopt(job, nested="spec.template")
        job_instance: V1Job = BatchV1Api().create_namespaced_job(
            namespace=namespace, body=job)

        job_instance_metadata: V1ObjectMeta = job_instance.metadata
        logger.debug("Started Job: %s", job_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "Job"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": job_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "JobCreated"
Example #29
def delete_poddefault(namespace: str, name: str, client: dynamic.DynamicClient, logger: kopf.Logger) -> None:
    api = client.resources.get(api_version=KUBEFLOW_API_VERSION, group=KUBEFLOW_API_GROUP, kind="PodDefault")
    api.delete(namespace=namespace, name=name, body={})
    logger.debug("Deleted PodDefault: %s in Namespace: %s", name, namespace)
def orbit_cron_job_monitor(
    namespace: str,
    name: str,
    patch: kopf.Patch,
    status: kopf.Status,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    cron_jobs_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> Any:
    ns: Optional[Dict[str, Any]] = None
    k8s_job: Optional[Dict[str, Any]] = None

    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobDetailsNotFound",
                "error": "No Namespace resource found"
            }
        }
        return "JobDetailsNotFound"

    logger.debug("cron_jobs_idx: %s", cron_jobs_idx)
    for k8s_job in cron_jobs_idx.get((namespace, name), []):
        logger.debug("k8s_job: %s", k8s_job)

    if k8s_job is None:  # To tackle the race condition caused by Timer
        return "JobMetadataNotFound"

    if not k8s_job.get("status", {}):
        cron_job_status = "Activating"
    else:
        cron_job_status = "Active"

    if k8s_job.get("status"):
        for i in k8s_job.get("status", {}).get("active", [{}]):
            if i.get("name") not in status.get("orbitJobOperator",
                                               {}).get("cronJobIds", []):
                cron_job_ids: List[str] = status.get("orbitJobOperator",
                                                     {}).get("cronJobIds", [])
                cron_job_ids.append(i.get("name"))
                patch["status"] = {
                    "orbitJobOperator": {
                        "jobStatus": cron_job_status,
                        "jobName": k8s_job.get("name"),
                        "cronJobIds": cron_job_ids,
                    }
                }
            else:
                return cron_job_status
    else:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus":
                cron_job_status,
                "jobName":
                k8s_job.get("name"),
                "cronJobIds":
                status.get("orbitJobOperator", {}).get("cronJobIds", []),
            }
        }

    return cron_job_status