def deploy_function(function: DaskCluster, secrets=None):
    """Start a Dask cluster on Kubernetes for ``function`` and return it.

    Builds one scheduler pod and one worker pod template from
    ``function.spec``, updates the dask configuration (scheduler service
    template and cluster name), creates a ``KubeCluster`` and records the
    scheduler address, cluster name and (for ``NodePort`` services) the node
    ports on ``function.status``.

    Args:
        function: The runtime object whose ``spec`` / ``metadata`` describe
            the cluster to create. ``function.spec`` and ``function.status``
            are modified in place.
        secrets: Unused in this function; kept for interface compatibility.

    Returns:
        The created ``KubeCluster`` instance.

    Raises:
        ImportError: If dask, distributed, dask_kubernetes or
            kubernetes_asyncio cannot be imported.
    """
    # TODO: why is this here :|
    try:
        import dask
        from dask.distributed import Client, default_client  # noqa: F401
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from kubernetes_asyncio import client
    except ImportError as exc:
        # print() does not interpolate "%s" placeholders, so format explicitly.
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes", {exc}'
        )
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): this appends to spec.env itself (same list object),
        # so the change persists on the spec - confirm this is intended.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function, scrape_metrics=config.scrape_metrics)

    worker_args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        worker_args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        worker_args.extend(spec.args)
    scheduler_args = ["dask-scheduler"]

    # Settings shared by the scheduler and worker containers.
    container_kwargs = {
        "name": "base",
        "image": image,
        "env": env,
        "image_pull_policy": spec.image_pull_policy,
        "volume_mounts": spec.volume_mounts,
    }
    scheduler_container = client.V1Container(
        resources=spec.scheduler_resources, args=scheduler_args, **container_kwargs
    )
    worker_container = client.V1Container(
        resources=spec.worker_resources, args=worker_args, **container_kwargs
    )

    scheduler_pod_spec = kube_resource_spec_to_pod_spec(spec, scheduler_container)
    worker_pod_spec = kube_resource_spec_to_pod_spec(spec, worker_container)
    if spec.image_pull_secret:
        for pod_spec in (scheduler_pod_spec, worker_pod_spec):
            pod_spec.image_pull_secrets = [
                client.V1LocalObjectReference(name=spec.image_pull_secret)
            ]

    # NOTE(review): pod annotations (meta.annotation) are not applied to
    # either pod; only namespace and labels are set.
    scheduler_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        spec=scheduler_pod_spec,
    )
    worker_pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        spec=worker_pod_spec,
    )

    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            # An explicit node port forces the service type to NodePort.
            spec.service_type = "NodePort"
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
        }
    )

    cluster = KubeCluster(
        worker_pod,
        scheduler_pod_template=scheduler_pod,
        deploy_mode="remote",
        namespace=namespace,
        idle_timeout=spec.scheduler_timeout,
    )

    logger.info(f"cluster {cluster.name} started at {cluster.scheduler_address}")

    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name

    if spec.service_type == "NodePort":
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }

    # A fixed replica count takes precedence over adaptive scaling.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)

    return cluster
def deploy_function(function: DaskCluster, secrets=None):
    """Start a Dask cluster on Kubernetes for ``function`` and return it.

    Builds a single pod template from ``function.spec``, updates the dask
    configuration (scheduler service template and cluster name), creates a
    ``KubeCluster`` and records the scheduler address, cluster name and (for
    ``NodePort`` services) the node ports on ``function.status``.

    Args:
        function: The runtime object whose ``spec`` / ``metadata`` describe
            the cluster to create. ``function.spec`` and ``function.status``
            are modified in place.
        secrets: Unused in this function; kept for interface compatibility.

    Returns:
        The created ``KubeCluster`` instance.

    Raises:
        ImportError: If dask, distributed, dask_kubernetes or
            kubernetes_asyncio cannot be imported.
    """
    try:
        # Client, default_client and make_pod_spec are not used below; the
        # imports are kept so a missing package is still reported here.
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # print() does not interpolate "%s" placeholders, so format explicitly.
        print(
            'missing dask or dask_kubernetes, please run '
            f'"pip install dask distributed dask_kubernetes", {e}'
        )
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): this appends to spec.env itself (same list object),
        # so the change persists on the spec - confirm this is intended.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)

    args = ['dask-worker', "--nthreads", str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(
        name='base',
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )

    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy='Never',
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    # NOTE(review): pod annotations (meta.annotation) are not applied;
    # only namespace and labels are set.
    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        spec=pod_spec,
    )

    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            # An explicit node port forces the service type to NodePort.
            spec.service_type = 'NodePort'
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}',
        }
    )

    cluster = KubeCluster(
        pod,
        deploy_mode='remote',
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )

    logger.info(f'cluster {cluster.name} started at {cluster.scheduler_address}')

    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name

    if spec.service_type == 'NodePort':
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            'scheduler': ports[0].node_port,
            'dashboard': ports[1].node_port,
        }

    # A fixed replica count takes precedence over adaptive scaling.
    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)

    return cluster
image='holdenk/dask:latest', memory_limit='8G', memory_request='8G', cpu_limit=1, cpu_request=1, extra_container_config={"imagePullPolicy": "Always"}) scheduler_template = make_pod_spec( image='holdenk/dask:latest', memory_limit='4G', memory_request='4G', cpu_limit=1, cpu_request=1, extra_container_config={"imagePullPolicy": "Always"}) cluster = KubeCluster(pod_template=worker_template, scheduler_pod_template=scheduler_template) cluster.adapt( minimum=1) # or create and destroy workers dynamically based on workload from dask.distributed import Client client = Client(cluster) #end::make_dask_k8s_client[] # In[ ]: client # In[ ]: client.dashboard_link # In[ ]:
class DaskCluster(KubejobRuntime):
    """Runtime of kind ``'dask'`` that runs handlers through dask.

    Holds an optional ``KubeCluster`` (created lazily by :meth:`cluster`) and
    exposes a dask ``Client`` through :attr:`client`.
    """

    kind = 'dask'
    _is_nested = False

    def __init__(self, spec=None, metadata=None):
        """Initialise the runtime, default the build base image and label it."""
        super().__init__(spec, metadata)
        self._cluster = None
        self.spec.build.base_image = (
            self.spec.build.base_image or 'daskdev/dask:latest'
        )
        self.set_label('mlrun/class', self.kind)

    @property
    def spec(self) -> DaskSpec:
        """The runtime spec, stored as a ``DaskSpec``."""
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(spec, 'spec', DaskSpec)

    def to_pod(self):
        """Build a ``V1Pod`` describing a pod for this runtime.

        Returns:
            A ``client.V1Pod`` with a single container named ``'base'``.
        """
        image = self._image_path() or 'daskdev/dask:latest'
        env = self.spec.env
        namespace = self.metadata.namespace or config.namespace
        if self.spec.extra_pip:
            # NOTE(review): appends to self.spec.env itself (same list
            # object), so repeated calls append again - confirm intended.
            env.append(self.spec.extra_pip)

        # `client` here is the module-level name (not the `client` property,
        # which is only reachable through `self`).
        container = client.V1Container(
            name='base',
            image=image,
            env=env,
            command=None,
            args=self.spec.args,
            image_pull_policy=self.spec.image_pull_policy,
            volume_mounts=self.spec.volume_mounts,
            resources=self.spec.resources,
        )
        pod_spec = client.V1PodSpec(
            containers=[container],
            restart_policy='Never',
            volumes=self.spec.volumes,
            service_account=self.spec.service_account,
        )
        meta = client.V1ObjectMeta(
            namespace=namespace,
            labels=self.metadata.labels,
            annotations=self.metadata.annotations,
        )
        return client.V1Pod(metadata=meta, spec=pod_spec)

    @property
    def initialized(self):
        """``True`` when a cluster object has been created, else ``False``."""
        return bool(self._cluster)

    def cluster(self, scale=0):
        """Return the ``KubeCluster``, creating it on first use.

        Args:
            scale: Number of workers to scale to when the cluster is created;
                a falsy value (the default) enables adaptive scaling instead.
                Ignored when the cluster already exists.

        Returns:
            The ``KubeCluster`` instance.

        Raises:
            ImportError: If dask_kubernetes or dask.distributed is missing.
        """
        if not self._cluster:
            try:
                from dask_kubernetes import KubeCluster
                from dask.distributed import Client
            except ImportError:
                print('missing dask_kubernetes, please run "pip install dask_kubernetes"')
                raise
            self._cluster = KubeCluster(self.to_pod())
            if not scale:
                self._cluster.adapt()
            else:
                self._cluster.scale(scale)
            # Instantiated for its side effect only; presumably this makes
            # the client available via default_client() - TODO confirm.
            Client(self._cluster)
        return self._cluster

    @property
    def client(self):
        """A dask ``Client``.

        Returns the default client when one exists; otherwise a new client
        connected to this runtime's cluster if it was created, else a new
        ``Client()`` with no arguments.
        """
        from dask.distributed import Client, default_client

        try:
            return default_client()
        except ValueError:
            if self._cluster:
                return Client(self._cluster)
            return Client()

    def close(self):
        """Close the default dask client (if any) and this runtime's cluster."""
        from dask.distributed import default_client

        try:
            client = default_client()
            client.close()
        except ValueError:
            # Best effort: nothing to close when no default client exists.
            pass
        if self._cluster:
            self._cluster.close()

    def _run(self, runobj: RunObject, execution):
        """Run the handler of ``runobj`` once via ``dask.delayed``.

        Args:
            runobj: Run object whose ``spec.handler`` is executed.
            execution: Execution context used to record state and results.

        Returns:
            ``execution.to_dict()``. If the handler raises, the error text is
            recorded with ``execution.set_state(error=...)`` and no
            ``'return'`` result is logged.
        """
        handler = runobj.spec.handler
        self._force_handler(handler)
        from dask import delayed

        if self.spec.rundb:
            # todo: remote dask via k8s spec env
            environ['MLRUN_DBPATH'] = self.spec.rundb

        arg_list = get_func_arg(handler, runobj, execution)
        # Pre-bind so a failing handler does not leave `out` undefined for
        # the check below (which would raise UnboundLocalError).
        out = None
        try:
            task = delayed(handler)(*arg_list)
            out = task.compute()
        except Exception as e:
            err = str(e)
            execution.set_state(error=err)

        if out:
            execution.log_result('return', out)

        return execution.to_dict()

    def _run_many(self, tasks, execution, runobj: RunObject):
        """Submit the handler once per task through the dask client.

        Args:
            tasks: Iterable of task objects; each is converted with
                ``to_dict()`` to build its own context.
            execution: Not used by this method.
            runobj: Run object whose ``spec.handler`` is executed.

        Returns:
            A ``RunList`` with one ``_post_run`` response per task.
        """
        handler = runobj.spec.handler
        self._force_handler(handler)
        futures = []
        contexts = []
        tasks = list(tasks)
        for task in tasks:
            ctx = MLClientCtx.from_dict(
                task.to_dict(), self.spec.rundb, autocommit=True
            )
            args = get_func_arg(handler, task, ctx)
            resp = self.client.submit(handler, *args)
            futures.append(resp)
            contexts.append(ctx)

        resps = self.client.gather(futures)
        results = RunList()
        for r, c, t in zip(resps, contexts, tasks):
            if r:
                c.log_result('return', r)
            # todo: handle task errors
            resp = self._post_run(task=t)
            results.append(resp)

        print(resps)
        return results
def fetch(
    self,
    request_params,
    axis_params,
    start_dt,
    end_dt,
    download=False,
    download_format='netcdf',
    status_dict=None,
    max_nfiles=50,
    max_partition_sizes=None,
):
    """Fetch the requested datasets and either plot them or package a download.

    Progress is reported throughout via ``self.update_state`` with
    ``status_dict`` as the ``meta`` payload.

    Args:
        self: Object providing ``update_state(state=..., meta=...)``.
        request_params: Requested datasets; passed to ``get_delayed_ds`` and
            written to ``meta.yaml`` in download archives.
        axis_params: Mapping with ``'x'``, ``'y'`` and ``'z'`` entries naming
            the axis selections.
        start_dt: Start of the time selection (parsed with ``parser.parse``
            when building download file names).
        end_dt: End of the time selection.
        download: When true, write the merged data to files, zip them and
            upload the archive instead of plotting.
        download_format: ``'netcdf'`` or ``'csv'``.
        status_dict: Status payload, updated in place. A fresh dict is used
            when omitted.
        max_nfiles: Maximum number of partition files allowed in a download.
        max_partition_sizes: Per-format maximum partition size strings.
            Defaults to ``{'netcdf': '100MB', 'csv': '10MB'}``.

    Returns:
        ``None`` when there is no data; for downloads a dict with
        ``"file_url"`` (plus ``"msg"`` when the request is too large); for
        plots a one-element tuple holding a dict with ``"x"``, ``"y"``,
        ``"z"``, ``"count"`` and ``"shaded"``.
    """
    # Avoid shared mutable defaults: status_dict is mutated below, so a
    # default dict object would leak messages between calls.
    if status_dict is None:
        status_dict = {}
    if max_partition_sizes is None:
        max_partition_sizes = {'netcdf': '100MB', 'csv': '10MB'}

    self.update_state(state="PROGRESS", meta=status_dict)

    ds_list = get_delayed_ds(request_params, axis_params)
    status_dict.update({"msg": f"{len(request_params)} datasets requested."})
    self.update_state(state="PROGRESS", meta=status_dict)

    max_data_size = np.sum([v['total_size'] for v in ds_list.values()])
    max_mem_size = max_data_size / 1024**3
    dask_spec = {'min_workers': 1, 'max_workers': 2}
    # Environment values are strings; convert so the comparison below works.
    data_threshold = float(os.environ.get('DATA_THRESHOLD', 50))

    client = None
    cluster = None
    try:
        if max_mem_size > data_threshold:
            image_repo, image_name, image_tag = (
                'cormorack',
                'cava-dask',
                '20210610',
            )
            desired_image = os.environ.get(
                "DASK_DOCKER_IMAGE", f"{image_repo}/{image_name}:{image_tag}"
            )
            match = re.match(r"(.+)/(.+):(.+)", desired_image)
            if match is not None:
                image_repo, image_name, image_tag = match.groups()
            dask_spec = determine_workers(
                max_mem_size,
                image_repo=image_repo,
                image_name=image_name,
                image_tag=image_tag,
            )
            status_dict.update(
                {
                    "msg": f"Setting up distributed computing cluster. Max data size: {memory_repr(max_data_size)}"
                }
            )
            self.update_state(state="PROGRESS", meta=status_dict)
            cluster = KubeCluster(
                dask_spec['pod_spec'],
                n_workers=dask_spec['min_workers'],
            )
            cluster.adapt(
                minimum=dask_spec['min_workers'],
                maximum=dask_spec['max_workers'],
            )
            client = Client(cluster)

        # TODO: Need to add other parameters for multidimensional
        # need a check for nutnr,pco2,ph,optaa add int_ctd_pressure
        # parameters.append("int_ctd_pressure")
        # for spikr
        # parameters.append("spectra")
        status_dict.update({"msg": "Retrieving data from zarr store ..."})
        self.update_state(state="PROGRESS", meta=status_dict)
        data_list = {
            k: v['dataset'].sel(time=(start_dt, end_dt)).dataset
            for k, v in ds_list.items()
        }

        status_dict.update({"msg": "Validating datasets..."})
        self.update_state(state="PROGRESS", meta=status_dict)
        if any(v is None for v in data_list.values()):
            # At least one requested dataset is missing entirely.
            status_dict.update(
                {"msg": "One of the dataset does not contain data."}
            )
            self.update_state(state="PROGRESS", meta=status_dict)
            # NOTE(review): pause presumably lets the status be seen - confirm.
            time.sleep(2)
            result = None
        elif any(len(v.time) == 0 for v in data_list.values()):
            # At least one dataset has no samples in the selected time range.
            empty_streams = [k for k, v in data_list.items() if len(v.time) == 0]
            status_dict.update(
                {"msg": f"Empty data stream(s) found: {','.join(empty_streams)}."}
            )
            self.update_state(state="PROGRESS", meta=status_dict)
            time.sleep(2)
            status_dict.update(
                {
                    "msg": "Plot creation is not possible with specified parameters. Please try again."
                }
            )
            self.update_state(state="PROGRESS", meta=status_dict)
            time.sleep(2)
            result = None
        else:
            total_requested_size = np.sum(
                np.fromiter((v.nbytes for v in data_list.values()), dtype=int)
            )
            status_dict.update(
                {
                    "msg": f"There are {memory_repr(total_requested_size)} of data to be processed."
                }
            )
            self.update_state(state="PROGRESS", meta=status_dict)

            if len(data_list) > 1:
                merged = _merge_datasets(data_list, start_dt, end_dt)
            else:
                merged = next(iter(data_list.values()))

            data_count = len(merged.time)

            if data_count == 0:
                status_dict.update(
                    {"msg": "Merged dataset does not contain data."}
                )
                self.update_state(state="PROGRESS", meta=status_dict)
                result = None
            elif download:
                status_dict.update({"msg": "Preparing dataset for download..."})
                self.update_state(state="PROGRESS", meta=status_dict)
                format_ext = {'netcdf': 'nc', 'csv': 'csv'}
                start_dt_str = parser.parse(start_dt).strftime('%Y%m%dT%H%M%S')
                end_dt_str = parser.parse(end_dt).strftime('%Y%m%dT%H%M%S')
                dstring = f"{start_dt_str}_{end_dt_str}"
                continue_download = True

                if download_format == 'csv':
                    ddf = merged.to_dask_dataframe().repartition(
                        partition_size=max_partition_sizes[download_format]
                    )
                    # Refuse requests that would produce more than
                    # max_nfiles partition files.
                    if ddf.npartitions > max_nfiles:
                        message = "The amount of data to be downloaded is too large for CSV data format. Please make a smaller request."
                        result = {
                            "file_url": None,
                            "msg": message,
                        }
                        continue_download = False
                    else:
                        ncfile = dstring
                        outglob = os.path.join(
                            ncfile, f'*.{format_ext[download_format]}'
                        )
                        ddf.to_csv(outglob, index=False)
                elif download_format == 'netcdf':
                    max_chunk_size = dask.utils.parse_bytes(
                        max_partition_sizes[download_format]
                    )
                    # Number of time steps per output file so that each file
                    # stays around max_chunk_size bytes.
                    smallest_chunk = math.ceil(
                        merged.time.shape[0] / (merged.nbytes / max_chunk_size)
                    )
                    slices = [
                        (i, i + smallest_chunk)
                        for i in range(0, merged.time.shape[0], smallest_chunk)
                    ]
                    # Refuse requests that would produce more than
                    # max_nfiles files.
                    if len(slices) > max_nfiles:
                        message = "The amount of data to be downloaded is too large for NetCDF data format. Please make a smaller request."
                        result = {
                            "file_url": None,
                            "msg": message,
                        }
                        continue_download = False
                    elif len(slices) == 1:
                        # Everything fits in a single file.
                        ncfile = f"{dstring}.{format_ext[download_format]}"
                        merged.to_netcdf(ncfile)
                    else:
                        # One numbered file per slice inside a directory.
                        ncfile = dstring
                        outglob = os.path.join(
                            ncfile, f'*.{format_ext[download_format]}'
                        )
                        if not os.path.exists(ncfile):
                            os.mkdir(ncfile)
                        for idx, sl in enumerate(slices):
                            nc_name = f"{idx}.nc"
                            part_ds = merged.isel(time=slice(*sl))
                            part_ds.to_netcdf(os.path.join(ncfile, nc_name))

                if continue_download:
                    zipname = (
                        f"CAVA_{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}.zip"
                    )
                    download_bucket = "ooi-data-download"
                    cache_location = f"s3://{download_bucket}"
                    fs = fsspec.get_mapper(cache_location).fs
                    target_url = os.path.join(
                        cache_location, os.path.basename(zipname)
                    )
                    with fs.open(target_url, mode='wb') as f:
                        with zipfile.ZipFile(
                            f, 'w', compression=zipfile.ZIP_DEFLATED
                        ) as zf:
                            status_dict.update({"msg": "Creating zip file..."})
                            self.update_state(state="PROGRESS", meta=status_dict)
                            zf.writestr(
                                'meta.yaml',
                                yaml.dump(
                                    {
                                        'reference_designators': request_params,
                                        'axis_parameters': axis_params,
                                        'start_datetime': start_dt,
                                        'end_datetime': end_dt,
                                    }
                                ),
                            )
                            if os.path.isdir(ncfile):
                                # if ncfile is directory,
                                # there should be an outglob variable
                                data_files = sorted(glob.glob(outglob))
                                for data_file in data_files:
                                    zf.write(data_file)
                                shutil.rmtree(ncfile)
                            else:
                                zf.write(ncfile)
                                os.unlink(ncfile)
                    download_url = f"https://{download_bucket}.s3.us-west-2.amazonaws.com/{zipname}"
                    result = {"file_url": download_url}
            else:
                status_dict.update({"msg": "Plotting merged datasets..."})
                self.update_state(state="PROGRESS", meta=status_dict)
                # Swapping dimensions for plotting to work if time is not
                # an axis selection
                if axis_params["x"] != "time":
                    merged = merged.swap_dims({"time": axis_params['x']})
                # Shading process
                final_dct, shaded, color_column = _plot_merged_dataset(
                    merged, axis_params
                )
                x = final_dct.get(axis_params['x'], [])
                y = final_dct.get(axis_params['y'], [])
                z = []
                if axis_params['z']:
                    z = final_dct.get(axis_params['z'], np.array([]))
                elif shaded:
                    z = final_dct.get(color_column, np.array([]))
                # NOTE(review): the trailing comma makes this a one-element
                # tuple, not a dict - kept as-is; confirm callers expect it.
                result = (
                    {
                        "x": x,
                        "y": y,
                        "z": z,
                        "count": data_count,
                        "shaded": shaded,
                    },
                )
        logger.info("Result done.")
        # ================ End Compute results ========================
    finally:
        # Cleans up dask, also when an error interrupted the computation.
        if client is not None:
            client.close()
        if cluster is not None:
            cluster.close()
    return result