def publish_dataset_to_cluster():
    """Start a one-GPU Dask cluster, publish the census data, register callbacks.

    Steps, in order:
      1. Hand the census archive URL and the local parquet path to
         ``check_dataset`` (presumably fetches the data when it is missing --
         confirm against that helper).
      2. Create a ``LocalCUDACluster`` restricted to device "0", connect a
         ``Client`` to it and print the dashboard link.
      3. Load the data and publish it under the names ``'pd_df_d'`` and
         ``'c_df_d'``.
      4. Register the plot-update callback with the client.

    Per the original author's note, the cluster has to be created from inside
    the ``__main__`` block, so call this function from there.
    """
    data_path = "../data/census_data.parquet"
    census_data_url = 'https://s3.us-east-2.amazonaws.com/rapidsai-data/viz-data/census_data.parquet.tar.gz'
    check_dataset(census_data_url, data_path)

    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0")
    client = Client(cluster)
    print(f"Dask status: {cluster.dashboard_link}")

    def load_and_publish_dataset():
        """Persist both frame flavours on the cluster and (re)publish them."""
        # Delayed result of load_dataset (a cudf DataFrame per the original note).
        gpu_frame = delayed(load_dataset)(data_path).persist()
        # Delayed pandas conversion of that same frame.
        pandas_frame = delayed(gpu_frame.to_pandas)().persist()

        # Drop any earlier publication so the publish calls below do not clash.
        for published_name in ('pd_df_d', 'c_df_d'):
            if published_name in client.datasets:
                client.unpublish_dataset(published_name)

        client.publish_dataset(pd_df_d=pandas_frame)
        client.publish_dataset(c_df_d=gpu_frame)

    load_and_publish_dataset()

    # Originally labelled "Precompute field bounds"; the fetched handle is not
    # used any further inside this function.
    _published_frame = client.get_dataset('c_df_d')

    # Top-level callback that updates the plots.
    register_update_plots_callback(client)
def publish_dataset_to_cluster():
    """Start a one-GPU Dask cluster, publish the census data, register callbacks.

    Creates a ``LocalCUDACluster`` restricted to device "0", connects a
    ``Client`` and prints the dashboard link, publishes the data under the
    names ``'pd_df_d'`` and ``'c_df_d'``, installs a Dash callback that
    restarts the cluster and republishes the data when the ``reset-gpu``
    input is clicked, and finally registers the plot-update callback.

    Per the original author's note, the cluster has to be created from inside
    the ``__main__`` block, so call this function from there.
    """
    data_path = "/home/ajay/new_dev/plotly/census_large/data/census_data_epsg_3857.parquet/*"

    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0")
    client = Client(cluster)
    print(f"Dask status: {cluster.dashboard_link}")

    def load_and_publish_dataset():
        """Persist both frame flavours on the cluster and (re)publish them."""
        # Delayed result of load_dataset (a cudf DataFrame per the original note).
        gpu_frame = delayed(load_dataset)(data_path).persist()
        # Delayed pandas conversion of that same frame.
        pandas_frame = delayed(gpu_frame.to_pandas)().persist()

        # Drop any earlier publication so the publish calls below do not clash.
        for published_name in ('pd_df_d', 'c_df_d'):
            if published_name in client.datasets:
                client.unpublish_dataset(published_name)

        client.publish_dataset(pd_df_d=pandas_frame)
        client.publish_dataset(c_df_d=gpu_frame)

    load_and_publish_dataset()

    # Originally labelled "Precompute field bounds"; the fetched handle is not
    # used any further inside this function.
    _published_frame = client.get_dataset('c_df_d')

    @app.callback(
        Output('reset-gpu-complete', 'children'),
        [Input('reset-gpu', 'n_clicks')]
    )
    def restart_cluster(n_clicks):
        """Restart the cluster and reload the datasets once the button is clicked."""
        if not n_clicks:
            # Nothing was clicked (None or 0): leave the cluster alone.
            return
        print("Restarting LocalCUDACluster")
        client.unpublish_dataset('pd_df_d')
        client.unpublish_dataset('c_df_d')
        client.restart()
        load_and_publish_dataset()

    # Top-level callback that updates the plots.
    register_update_plots_callback(client)
from distributed import Client
import cudf
import time


def main():
    """Connect to the scheduler at localhost:8786 and preview the 'names' dataset."""
    scheduler_client = Client('localhost:8786')
    print(scheduler_client)

    # Fetch the dataset that another process published under the name 'names'
    # and show its head.
    print("Readings 'names' published dataset from another process")
    names_frame = scheduler_client.get_dataset('names')
    print(names_frame.head())


if __name__ == '__main__':
    main()
class DaskDelegate(Delegate):
    """Delegate that runs jobs as Dask futures keyed by their job id.

    A job is submitted to the Dask client with ``key=job_id`` and published as
    a dataset of the same name; a follow-up task stores its result through the
    configured cache provider. A job counts as complete once the cache
    provider reports an entry for its id.
    """

    type: str = "dask"

    def __init__(self, delegate_config: DaskDelegateConfig):
        """Attach to a Dask client and prepare the scheduler-side helpers.

        The global client is reused when one exists; otherwise a new client is
        created from ``delegate_config.kube_cluster`` when that is set, or
        from the configured address and port.
        """
        # NOTE(review): a bare `super()` does not run the base initializer.
        # Left unchanged because `Delegate.__init__` is not visible from here;
        # confirm whether `super().__init__(...)` was intended.
        super()
        self.delegate_config = delegate_config
        self.cache_provider = self.delegate_config.cache_provider

        # Attempt to load the global Dask client.
        try:
            self.client = get_client()
        except ValueError:
            if self.delegate_config.kube_cluster is not None:
                self.client = Client(self.delegate_config.kube_cluster)
                print(self.delegate_config.kube_cluster)
            else:
                self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

        # Functions to be run on the scheduler via `run_on_scheduler`, which
        # supplies the `dask_scheduler` argument.
        def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
            return job_id in dask_scheduler.tasks

        def __scheduler_job_state(dask_scheduler, job_id: str) -> TaskState:
            return dask_scheduler.tasks[job_id].state

        self.scheduler_job_exists = __scheduler_job_exists
        self.scheduler_job_state = __scheduler_job_state

    def __job_state(self, job_id: str) -> TaskState:
        """Return the scheduler's state for the task keyed by ``job_id``."""
        return self.client.run_on_scheduler(self.scheduler_job_state, job_id=job_id)

    def connect(self) -> bool:
        """Always return True; there is nothing to connect."""
        return True

    def test_connection(self) -> bool:
        """Always return True.

        Shimmed out until there is a good way to test a Dask and Redis
        connection.
        """
        return True

    def create_job(self, job_id: str) -> bool:
        """Always return True; there is no concept of creating a job."""
        return True

    def start_job(self, job_id: str, work: Callable, *args, **kwargs) -> bool:
        """Submit ``work`` as the job ``job_id``.

        Returns False without submitting anything when the job already exists
        on the scheduler or is already complete; True otherwise.

        Positional string arguments that start with ``result://`` are replaced
        by the published dataset named by the rest of the string, which lets
        callers reference an in-progress or remote job without direct access
        to it.
        """
        if self.job_exists(job_id) or self.job_complete(job_id):
            return False

        # Resolve instances of the internal `result://` proxy protocol.
        function_args = [
            self.client.get_dataset(arg.replace("result://", ""))
            if isinstance(arg, str) and arg.startswith("result://")
            else arg
            for arg in args
        ]

        # Create a job to run the desired function.
        job_future: Future = self.client.submit(work, *function_args, **kwargs, key=job_id, pure=False)

        # Start an additional cache job which depends on the result of the previous one.
        cache_future: Future = self.client.submit(self.cache_provider.put, *[job_id, job_future], pure=False)

        # Publish the job as a dataset to maintain state across requests.
        self.client.publish_dataset(job_future, name=job_id, override=True)
        # NOTE(review): no name is passed for the cache future; confirm that
        # this call actually publishes it.
        self.client.publish_dataset(cache_future, override=True)

        return True

    def stop_job(self, job_id: str) -> bool:
        """Cancel the job ``job_id`` together with its strong dependencies.

        Returns False when the scheduler does not know the job, True after the
        cancellation has been requested. A dependency is treated as strong
        when its key, with ``job_id`` removed, starts with ``/``.
        """
        if not self.job_exists(job_id):
            return False

        def dependency_keys(dask_scheduler, job_id: str):
            # Runs on the scheduler: keys of the tasks this job depends on.
            return [state.key for state in dask_scheduler.tasks[job_id].dependencies]

        try:
            dependencies = self.client.run_on_scheduler(dependency_keys, job_id=job_id)
        except KeyError:
            # The task vanished between the existence check and this lookup;
            # fall back to cancelling the job itself only.
            dependencies = []

        # Filter out any weak dependencies. Non-string keys cannot carry the
        # job prefix and are skipped.
        futures = [
            Future(key)
            for key in dependencies
            if isinstance(key, str) and key.replace(job_id, "").startswith("/")
        ]
        futures.append(Future(job_id))

        self.client.cancel(futures)
        self.client.unpublish_dataset(job_id)

        # Hacky fix -- Simulation processes continue executing EVEN IF the parent task is killed.
        def hacky():
            os.system("pkill -f 'Simulation.out'")

        self.client.run(hacky, nanny=True)

        return True

    def job_status(self, job_id: str) -> JobStatus:
        """Build a ``JobStatus`` describing the job ``job_id``.

        Checked in order: complete (result in the cache), unknown to the
        scheduler, otherwise the scheduler's task state translated through
        the mapping below. A state missing from the mapping raises KeyError.
        """
        # If the job is complete (results exist as a dataset or in the vault).
        if self.job_complete(job_id):
            status = JobStatus()
            status.status_id = JobState.DONE
            status.status_text = "The job is complete."
            status.has_failed = False
            status.is_done = True

            return status

        # If the job doesn't exist.
        if not self.job_exists(job_id):
            status = JobStatus()
            status.status_id = JobState.DOES_NOT_EXIST
            status.status_text = f"A job with job_id: '{job_id}' does not exist."
            status.has_failed = True
            status.is_done = False

            return status

        status_mapping = {
            "released": (JobState.STOPPED, "The job is known but not actively computing or in memory."),
            "waiting": (JobState.WAITING, "The job is waiting for dependencies to arrive in memory."),
            "no-worker": (JobState.WAITING, "The job is waiting for a worker to become available."),
            # Newer schedulers park ready tasks in "queued" while workers are busy.
            "queued": (JobState.WAITING, "The job is waiting for a worker to become available."),
            "processing": (JobState.RUNNING, "The job is running."),
            "memory": (JobState.DONE, "The job is done and is being held in memory."),
            "erred": (JobState.FAILED, "The job has failed."),
            "done": (JobState.DONE, "The job is done and has been cached / stored on disk.")
        }

        # Grab the task state from the scheduler.
        future_status = self.__job_state(job_id)
        state_id, state_text = status_mapping[future_status]

        status = JobStatus()
        status.status_id = state_id
        status.status_text = state_text
        status.is_done = status.status_id is JobState.DONE
        status.has_failed = status.status_id is JobState.FAILED

        return status

    def job_results(self, job_id: str):
        """Return the result of the job ``job_id``.

        The published dataset is preferred; otherwise the cache provider is
        consulted. Raises ``Exception`` when neither holds the result.
        """
        # The results of this job may exist on the client dataset.
        if job_id in self.client.datasets:
            print("[DEBUG] Getting results from dataset.")
            return self.client.get_dataset(name=job_id).result()

        # If the results are not in the cache, raise an exception.
        if not self.cache_provider.exists(job_id):
            raise Exception(f"Result with ID '{job_id}' does not exist in the cache.")

        return self.cache_provider.get(job_id)

    def job_complete(self, job_id: str) -> bool:
        """Return whether the cache provider holds a result for ``job_id``."""
        # Finished job results must exist within the cache for it to be considered 'done'.
        return self.cache_provider.exists(job_id)

    def job_exists(self, job_id: str) -> bool:
        """Return whether the scheduler currently tracks a task keyed ``job_id``."""
        return self.client.run_on_scheduler(self.scheduler_job_exists, job_id=job_id)

    def get_remote_dependency(self, dependency_id: str):
        """Return the published dataset named ``dependency_id``.

        Raises ``Exception`` when the lookup yields None.
        """
        # NOTE(review): `get_dataset` is called without a default, so a missing
        # name presumably raises from the client before the None check below
        # is reached -- confirm.
        dependency = self.client.get_dataset(name=dependency_id)

        if dependency is not None:
            return dependency

        raise Exception("Something broke, dependency does not exist within distributed memory.")
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    """Run ``calculate_chunk`` over ``chunks`` on a Dask cluster.

    The entityset and the pickled features are scattered to the workers once,
    then one task per chunk is mapped and the results are gathered as they
    complete.

    Args:
        chunks: Sequence of chunks; each becomes one ``calculate_chunk`` task.
        features: Features to compute; pickled with cloudpickle and scattered.
        entityset: EntitySet scattered to the workers, or reused when a
            dataset with the same token is already published on the cluster.
        n_jobs: Passed to ``n_jobs_to_workers`` to size a new local cluster;
            the worker count is capped at ``len(chunks)``.
        verbose: When truthy, show a progress bar while gathering.
        approximate, training_window, save_progress, no_unapproximated_aggs,
            cutoff_df_time_var, target_time, pass_columns: Forwarded unchanged
            to ``calculate_chunk``.
        dask_kwargs: Optional dict. ``'cluster'`` supplies an existing cluster
            to use (it is not closed here). Otherwise a ``LocalCluster`` is
            created; ``'diagnostics_port'`` is extracted and the remaining
            keys are forwarded to ``LocalCluster``. ``None`` is treated as an
            empty dict, and the caller's dict is never modified.

    Returns:
        List of ``calculate_chunk`` results in completion order.
    """
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    # Work on a private copy: tolerates the None default and keeps the removal
    # of 'diagnostics_port' below from leaking into the caller's dict.
    dask_kwargs = dict(dask_kwargs) if dask_kwargs else {}
    cluster_supplied = 'cluster' in dask_kwargs

    client = None
    cluster = None
    pbar = None

    try:
        if cluster_supplied:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = dask_kwargs.pop('diagnostics_port', None)

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # If the cluster has a bokeh port, tell the user which port the
            # dashboard ended up on (it may differ from the requested one).
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)

        # Scatter the entityset; futures are denoted with a leading underscore.
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # Pickle the features and scatter them as well.
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # Map chunks.
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if pbar is not None:
                    pbar.update()
    finally:
        # Close the progress bar on failure too, so it is not left dangling.
        if pbar is not None:
            pbar.close()
        # Only tear down a cluster that was created here.
        if not cluster_supplied and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
# Script: connect to the configured Dask scheduler, fetch the dataset that was
# published under the name 'temp_surface', and print its mean.
from distributed import Client
from dask_configuration import dask_scheduler_url

# Connect to Dask using the scheduler URL from the local configuration module.
client = Client(dask_scheduler_url)
# Retrieve the dataset published on the cluster as 'temp_surface'.
temp_cube=client.get_dataset('temp_surface')
# NOTE(review): the type of temp_cube is not visible here, so whether .mean()
# is lazy or computes immediately depends on that object -- confirm.
mean_temp=temp_cube.mean()
print(mean_temp)