예제 #1
0
def create_cluster(**kwargs):
    """Create a dask KubeCluster for the current task.

    Worker count, resources and image are read from ``task['inputs']``;
    the worker pods are labeled so they can be traced back to the task.
    """
    inputs = task['inputs']
    n_workers = inputs.get('workers', 0)
    cores = inputs.get('worker_cores', 2)
    mem = inputs.get('worker_memory', 2)
    worker_image = inputs.get('worker_image', 'daskdev/dask:latest')

    # Identical dict is used for both limits and requests.
    res = {'cpu': str(cores), 'memory': str(mem)}

    worker_args = [
        'dask-worker',
        '--nthreads', str(cpu_to_threads(cores)),
        '--no-bokeh',
        '--memory-limit', f'{mem}B',
        '--death-timeout', '60',
    ]

    worker_container = client.V1Container(
        name='dask',
        image=worker_image,
        args=worker_args,
        resources=client.V1ResourceRequirements(limits=res, requests=res),
    )

    pull_secrets = [
        client.V1LocalObjectReference(name=secret_name)
        for secret_name in inputs.get('pull_secrets', ['docker'])
    ]

    pod_template = client.V1Pod(
        metadata=client.V1ObjectMeta(labels={
            'cowait/task': 'worker-' + task.get('id'),
            'cowait/parent': task.get('id'),
        }),
        spec=client.V1PodSpec(
            restart_policy='Never',
            image_pull_secrets=pull_secrets,
            containers=[worker_container],
        ),
    )

    return KubeCluster(
        pod_template=pod_template,
        n_workers=n_workers,
    )
예제 #2
0
def default_cluster_agent_deployment():
    """ Default cluster agent deployment """
    selector_labels = {
        'app': 'epsagon-cluster-agent'
    }

    agent_container = client.V1Container(
        name='cluster-agent',
        image='epsagon/cluster-agent:test',
        # required for pulling from the docker local loaded images
        # and not from Epsagon remote hub
        image_pull_policy='Never',
        env=[
            client.V1EnvVar(name='EPSAGON_TOKEN', value='123'),
            client.V1EnvVar(name='EPSAGON_CLUSTER_NAME', value='test'),
            client.V1EnvVar(name='EPSAGON_DEBUG', value='false'),
            client.V1EnvVar(name='EPSAGON_COLLECTOR_URL', value='http://localhost:5000'),
        ],
    )

    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=selector_labels.copy()),
        spec=client.V1PodSpec(
            service_account_name='cluster-agent',
            containers=[agent_container],
        ),
    )

    return client.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=client.V1ObjectMeta(name='cluster-agent', namespace='epsagon-monitoring'),
        spec=client.V1DeploymentSpec(
            selector=client.V1LabelSelector(match_labels=selector_labels.copy()),
            replicas=1,
            template=pod_template,
        ),
    )
예제 #3
0
    async def deploy_resource(
        self, resource_attributes: AttributeDict
    ) -> AttributeDict:
        """
        Deploy a drone as a single-replica Kubernetes Deployment.

        ``resource_attributes.drone_uuid`` is used as the name and the
        ``app`` label of every object created here. If the machine type
        configuration enables HPA, a HorizontalPodAutoscaler targeting
        the deployment is created as well.

        :param resource_attributes: attributes of the drone to deploy
        :returns: result of ``handle_response`` for a dict holding the
            deployment uid, name and the initial state ``"Booting"``
        """
        # Environment entries forwarded into the container as TardisDrone* vars.
        drone_environment = self.drone_environment(
            resource_attributes.drone_uuid,
            resource_attributes.obs_machine_meta_data_translation_mapping,
        )

        spec = k8s_client.V1DeploymentSpec(
            replicas=1,
            selector=k8s_client.V1LabelSelector(
                match_labels={"app": resource_attributes.drone_uuid}
            ),
            template=k8s_client.V1PodTemplateSpec(),
        )
        # Pod template metadata must carry the same label the selector matches.
        spec.template.metadata = k8s_client.V1ObjectMeta(
            name=resource_attributes.drone_uuid,
            labels={"app": resource_attributes.drone_uuid},
        )
        container = k8s_client.V1Container(
            image=self.machine_type_configuration.image,
            args=self.machine_type_configuration.args,
            name=resource_attributes.drone_uuid,
            resources=k8s_client.V1ResourceRequirements(
                requests={
                    "cpu": self.machine_meta_data.Cores,
                    # Memory appears to be configured in GB and is converted
                    # to bytes here (x 1e09) -- TODO confirm the unit.
                    "memory": convert_to(self.machine_meta_data.Memory * 1e09, int),
                }
            ),
            env=[
                # e.g. Cores -> TardisDroneCores, Uuid -> TardisDroneUuid
                k8s_client.V1EnvVar(name=f"TardisDrone{key}", value=str(value))
                for key, value in drone_environment.items()
            ],
        )
        spec.template.spec = k8s_client.V1PodSpec(containers=[container])
        body = k8s_client.V1Deployment(
            metadata=k8s_client.V1ObjectMeta(name=resource_attributes.drone_uuid),
            spec=spec,
        )
        response_temp = await self.client.create_namespaced_deployment(
            namespace=self.machine_type_configuration.namespace, body=body
        )
        # Keep only the fields consumed downstream; "Booting" is the initial state.
        response = {
            "uid": response_temp.metadata.uid,
            "name": response_temp.metadata.name,
            "type": "Booting",
        }
        # Optionally attach a HorizontalPodAutoscaler scaling this deployment.
        if self.machine_type_configuration.hpa:
            spec = k8s_client.V1HorizontalPodAutoscalerSpec(
                max_replicas=self.machine_type_configuration.max_replicas,
                min_replicas=self.machine_type_configuration.min_replicas,
                target_cpu_utilization_percentage=self.machine_type_configuration.cpu_utilization,  # noqa: B950
                scale_target_ref=k8s_client.V1CrossVersionObjectReference(
                    api_version="apps/v1",
                    kind="Deployment",
                    name=resource_attributes.drone_uuid,
                ),
            )
            dep = k8s_client.V1HorizontalPodAutoscaler(
                metadata=k8s_client.V1ObjectMeta(name=resource_attributes.drone_uuid),
                spec=spec,
            )
            await self.hpa_client.create_namespaced_horizontal_pod_autoscaler(
                namespace=self.machine_type_configuration.namespace, body=dep
            )
        return self.handle_response(response)
예제 #4
0
def deploy_function(function: DaskCluster, secrets=None):
    """Start a remote dask cluster on Kubernetes for *function*.

    Builds a worker pod template from ``function.spec``, patches the
    scheduler service template, starts a ``KubeCluster`` and records the
    scheduler address/cluster name (and NodePort ports, when applicable)
    on ``function.status``.

    :param function: DaskCluster runtime object (spec/metadata/status)
    :param secrets: accepted for interface compatibility; not used here
    :returns: the started ``KubeCluster`` instance
    :raises ImportError: if dask / dask_kubernetes are not installed
    """
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec
        from dask.distributed import Client, default_client
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # FIX: print() performs no %-formatting; the original call printed
        # the literal '%s' and the exception as a separate argument.
        print('missing dask or dask_kubernetes, please run '
              f'"pip install dask distributed dask_kubernetes", {e}')
        raise e

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)
    args = ['dask-worker', "--nthreads", str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(name='base',
                                   image=image,
                                   env=env,
                                   args=args,
                                   image_pull_policy=spec.image_pull_policy,
                                   volume_mounts=spec.volume_mounts,
                                   resources=spec.resources)

    pod_spec = client.V1PodSpec(containers=[container],
                                restart_policy='Never',
                                volumes=spec.volumes,
                                service_account=spec.service_account)
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation,  # TODO: re-enable once supported
        spec=pod_spec)

    # Patch the scheduler service template before creating the cluster;
    # a fixed node_port implies a NodePort service type.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = 'NodePort'
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({"kubernetes.scheduler-service-template": svc_temp,
                     'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}'})

    cluster = KubeCluster(
        pod, deploy_mode='remote',
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout)

    logger.info('cluster {} started at {}'.format(
        cluster.name, cluster.scheduler_address
    ))

    # Record connection details so clients can attach to the cluster later.
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == 'NodePort':
        # ports[0]/ports[1] assumed scheduler/dashboard -- matches the
        # nodePort patch on ports[1] above.
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {'scheduler': ports[0].node_port,
                                      'dashboard': ports[1].node_port}

    # Fixed size when replicas is set; otherwise autoscale between bounds.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas,
                      maximum=spec.max_replicas)

    return cluster
예제 #5
0
def deploy_function(function: DaskCluster, secrets=None):
    """Start a remote dask cluster on Kubernetes for *function*.

    Builds a worker pod template from ``function.spec`` (including a
    worker memory limit when one is set), patches the scheduler service
    template, starts a ``KubeCluster`` and records the scheduler
    address/cluster name (and NodePort ports, when applicable) on
    ``function.status``.

    :param function: DaskCluster runtime object (spec/metadata/status)
    :param secrets: accepted for interface compatibility; not used here
    :returns: the started ``KubeCluster`` instance
    :raises ImportError: if dask / dask_kubernetes are not installed
    """
    # TODO: why is this here :|
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # FIX: print() performs no %-formatting; the original call printed
        # the literal '%s' and the exception as a separate argument.
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes", {e}'
        )
        raise e

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function, scrape_metrics=False)
    args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    # Propagate the container memory limit to the dask worker as well.
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        args.extend(spec.args)

    container = client.V1Container(
        name="base",
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )

    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy="Never",
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation,  # TODO: re-enable once supported
        spec=pod_spec,
    )

    # Patch the scheduler service template before creating the cluster;
    # a fixed node_port implies a NodePort service type.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = "NodePort"
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
        }
    )

    cluster = KubeCluster(
        pod,
        deploy_mode="remote",
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )

    # Lazy %-args: formatting only happens if the record is emitted.
    logger.info("cluster %s started at %s", cluster.name, cluster.scheduler_address)

    # Record connection details so clients can attach to the cluster later.
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == "NodePort":
        # ports[0]/ports[1] assumed scheduler/dashboard -- matches the
        # nodePort patch on ports[1] above.
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }

    # Fixed size when replicas is set; otherwise autoscale between bounds.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)

    return cluster
예제 #6
0
def make_pod_spec(
    image,
    labels=None,
    threads_per_worker=1,
    env=None,
    extra_container_config=None,
    extra_pod_config=None,
    memory_limit=None,
    memory_request=None,
    cpu_limit=None,
    cpu_request=None,
):
    """
    Create generic pod template from input parameters

    FIX: ``labels``, ``env``, ``extra_container_config`` and
    ``extra_pod_config`` previously used mutable default arguments
    (``{}``), which are shared across calls; they now default to ``None``
    and are replaced by a fresh dict per call. Passing ``{}`` explicitly
    behaves exactly as before.

    Examples
    --------
    >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G')
    """
    labels = {} if labels is None else labels
    env = {} if env is None else env
    extra_container_config = {} if extra_container_config is None else extra_container_config
    extra_pod_config = {} if extra_pod_config is None else extra_pod_config

    args = [
        "dask-worker",
        "$(DASK_SCHEDULER_ADDRESS)",
        "--nthreads",
        str(threads_per_worker),
        "--death-timeout",
        "60",
    ]
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])
    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(
            restart_policy="Never",
            containers=[
                client.V1Container(
                    name="dask-worker",
                    image=image,
                    args=args,
                    env=[
                        client.V1EnvVar(name=k, value=v)
                        for k, v in env.items()
                    ],
                )
            ],
        ),
    )

    # Only the requests/limits that were actually provided are filled in.
    resources = client.V1ResourceRequirements(limits={}, requests={})

    if cpu_request:
        resources.requests["cpu"] = cpu_request
    if memory_request:
        resources.requests["memory"] = memory_request

    if cpu_limit:
        resources.limits["cpu"] = cpu_limit
    if memory_limit:
        resources.limits["memory"] = memory_limit

    pod.spec.containers[0].resources = resources

    # Arbitrary overrides applied last so they win over the defaults above.
    for key, value in extra_container_config.items():
        _set_k8s_attribute(pod.spec.containers[0], key, value)

    for key, value in extra_pod_config.items():
        _set_k8s_attribute(pod.spec, key, value)
    return pod
예제 #7
0
    async def create_deployment(
        self,
        container: str,
        num_replicas: int,
        cpus: float = 1.0,
        memory: float = 1.0,
    ) -> Tuple[str, str]:
        """
        Deploy *container* on the cluster and expose it via a LoadBalancer.

        Creates a Deployment with ``num_replicas`` pods and a matching
        Service, then polls until the load balancer reports an external IP.

        :param container: container image to run
        :param num_replicas: number of pod replicas
        :param cpus: CPU request per pod
        :param memory: memory request per pod -- presumably in GB, since it
            is multiplied by 1024 into a "...M" quantity below (verify)
        :returns: (deployment id, external service URL)
        """
        assert self.auth_client
        assert self.cluster_endpoint

        # Bearer-token API client configuration for the target cluster.
        cfg = client.Configuration(
            host=f"https://{self.cluster_endpoint}:443",
            api_key={
                "authorization": f"Bearer {await self.auth_client.get()}"
            },
        )
        # NOTE(review): TLS verification is disabled here -- acceptable only
        # for trusted/internal endpoints; confirm this is intentional.
        cfg.verify_ssl = False

        async with ApiClient(configuration=cfg) as kube_api:
            apps_api = client.AppsV1Api(kube_api)
            core_api = client.CoreV1Api(kube_api)

            # Create deployment
            deployment_id = f"dep-{uuid.uuid4()}"
            deployment = client.V1Deployment(
                api_version="apps/v1",
                kind="Deployment",
                metadata=client.V1ObjectMeta(name=deployment_id),
                spec=client.V1DeploymentSpec(
                    replicas=num_replicas,
                    selector={"matchLabels": {
                        "dep": deployment_id
                    }},
                    template=client.V1PodTemplateSpec(
                        metadata=client.V1ObjectMeta(
                            labels={"dep": deployment_id}),
                        spec=client.V1PodSpec(containers=[
                            client.V1Container(
                                name=deployment_id,
                                env=[
                                    # The container reads PORT to know where to listen.
                                    client.V1EnvVar(name="PORT",
                                                    value=str(INTERNAL_PORT))
                                ],
                                image=container,
                                resources=client.V1ResourceRequirements(
                                    requests={
                                        "cpu": str(cpus),
                                        "memory": f"{int(memory * 1024)}M",
                                    }),
                                ports=[
                                    client.V1ContainerPort(
                                        container_port=INTERNAL_PORT)
                                ],
                            )
                        ]),
                    ),
                ),
            )
            await apps_api.create_namespaced_deployment(
                namespace=KUBE_NAMESPACE, body=deployment)

            # Create service
            service_id = f"{deployment_id}-svc"
            service_port = self.get_unassigned_port()
            service = client.V1Service(
                api_version="v1",
                kind="Service",
                metadata=client.V1ObjectMeta(
                    name=service_id,
                    # annotations={"cloud.google.com/load-balancer-type": "Internal"},
                ),
                spec=client.V1ServiceSpec(
                    # Route to the pods labeled by the deployment above.
                    selector={"dep": deployment_id},
                    ports=[
                        client.V1ServicePort(
                            protocol="TCP",
                            port=service_port,
                            target_port=INTERNAL_PORT,
                        )
                    ],
                    type="LoadBalancer",
                ),
            )
            await core_api.create_namespaced_service(namespace=KUBE_NAMESPACE,
                                                     body=service)

            # Poll for external URL
            # Blocks until the load balancer publishes an ingress IP;
            # there is no timeout, so this can wait indefinitely.
            service_ip = None
            while not service_ip:
                await asyncio.sleep(POLL_INTERVAL)
                ingress = (await core_api.read_namespaced_service(
                    name=service_id,
                    namespace=KUBE_NAMESPACE)).status.load_balancer.ingress
                if ingress:
                    service_ip = ingress[0].ip

        service_url = f"http://{service_ip}:{service_port}"
        print(f"Started deployment {deployment_id} at {service_url}")

        return deployment_id, service_url
예제 #8
0
 def setUp(self):
     """Wire mocked site config and Kubernetes API clients, and build the
     deployment bodies the adapter under test is expected to produce."""
     config = self.mock_config.return_value
     test_site_config = config.TestSite
     # Endpoint of Kube cluster
     test_site_config.host = "https://127.0.0.1:443"
     # Bearer token we are going to use to authenticate
     test_site_config.token = "31ada4fd-adec-460c-809a-9e56ceb75269"
     # Machine type configuration mirroring what a real site config provides.
     test_site_config.MachineTypeConfiguration = AttributeDict(
         test2large=AttributeDict(
             namespace="default",
             image="busybox:1.26.1",
             args=["sleep", "3600"],
             hpa="True",
             min_replicas="1",
             max_replicas="2",
             cpu_utilization="50",
         )
     )
     test_site_config.MachineMetaData = AttributeDict(
         test2large=AttributeDict(Cores=2, Memory=4)
     )
     kubernetes_api = self.mock_kubernetes_api.return_value
     kubernetes_hpa = self.mock_kubernetes_hpa.return_value
     # Expected Deployment body for drone "testsite-089123".
     spec = client.V1DeploymentSpec(
         replicas=1,
         selector=client.V1LabelSelector(match_labels={"app": "testsite-089123"}),
         template=client.V1PodTemplateSpec(),
     )
     container = client.V1Container(
         image="busybox:1.26.1",
         args=["sleep", "3600"],
         name="testsite-089123",
         resources=client.V1ResourceRequirements(
             requests={
                 "cpu": test_site_config.MachineMetaData.test2large.Cores,
                 # Memory in GB converted to bytes (x 1e9).
                 "memory": test_site_config.MachineMetaData.test2large.Memory * 1e9,
             }
         ),
         env=[
             client.V1EnvVar(name="TardisDroneCores", value="2"),
             client.V1EnvVar(name="TardisDroneMemory", value="4096"),
             client.V1EnvVar(name="TardisDroneUuid", value="testsite-089123"),
         ],
     )
     spec.template.metadata = client.V1ObjectMeta(
         name="testsite-089123",
         labels={"app": "testsite-089123"},
     )
     spec.template.spec = client.V1PodSpec(containers=[container])
     # Body the adapter should submit when creating the deployment.
     self.body = client.V1Deployment(
         metadata=client.V1ObjectMeta(name="testsite-089123"),
         spec=spec,
     )
     # create_namespaced_deployment returns the body plus a server-set uid.
     self.create_return_value = client.V1Deployment(
         metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
         spec=spec,
     )
     kubernetes_api.create_namespaced_deployment.return_value = async_return(
         return_value=self.create_return_value
     )
     # read_namespaced_deployment additionally reports a Progressing status.
     condition_list = [
         client.V1DeploymentCondition(
             status="True",
             type="Progressing",
         )
     ]
     self.read_return_value = client.V1Deployment(
         metadata=client.V1ObjectMeta(name="testsite-089123", uid="123456"),
         spec=spec,
         status=client.V1DeploymentStatus(conditions=condition_list),
     )
     kubernetes_api.read_namespaced_deployment.return_value = async_return(
         return_value=self.read_return_value
     )
     # Mutating calls resolve to None; they are awaited but their results unused.
     kubernetes_api.replace_namespaced_deployment.return_value = async_return(
         return_value=None
     )
     kubernetes_api.delete_namespaced_deployment.return_value = async_return(
         return_value=None
     )
     kubernetes_hpa.create_namespaced_horizontal_pod_autoscaler.return_value = (
         async_return(return_value=None)
     )
     kubernetes_hpa.delete_namespaced_horizontal_pod_autoscaler.return_value = (
         async_return(return_value=None)
     )
     # System under test.
     self.kubernetes_adapter = KubernetesAdapter(
         machine_type="test2large", site_name="TestSite"
     )