def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    """Run ``calculate_chunk`` over every chunk on a dask.distributed cluster.

    The entityset and the pickled feature definitions are scattered to the
    workers once, then one ``calculate_chunk`` task is submitted per chunk.
    Results are gathered as tasks complete, so the returned list follows
    completion order rather than the order of ``chunks``.

    Args:
        chunks: Sequence of chunks; each element becomes the positional
            argument of one ``calculate_chunk`` task.
        features: Feature definitions; pickled with ``cloudpickle`` and
            scattered to the workers.
        approximate, training_window, save_progress, no_unapproximated_aggs,
        cutoff_df_time_var, target_time, pass_columns: Forwarded unchanged as
            keyword arguments to ``calculate_chunk``.
        verbose (bool): If true, show a progress bar counting finished chunks.
            (``calculate_chunk`` itself is always called with
            ``verbose=False``.)
        entityset: Object scattered to the workers and published on the
            cluster under a token-derived dataset name so later calls can
            reuse it.
        n_jobs: Passed to ``n_jobs_to_workers`` to derive the number of local
            workers (capped at ``len(chunks)``). Only used when no cluster is
            supplied.
        dask_kwargs (dict, optional): Either contains ``'cluster'`` (an
            existing cluster to connect to; it is left running afterwards),
            or keyword arguments for ``LocalCluster``. An optional
            ``'diagnostics_port'`` entry is extracted and passed explicitly.
            ``None`` is treated as an empty dict. The caller's dict is never
            modified.

    Returns:
        list: One ``calculate_chunk`` result per chunk, in completion order.

    Raises:
        Any exception raised while creating the cluster, scattering data or
        computing a chunk is propagated after the client (and a locally
        created cluster) has been closed.
    """
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    # Work on a private copy: the default is None, and the
    # 'diagnostics_port' entry is removed below, which must not leak back
    # into the caller's dict.
    dask_kwargs = {} if dask_kwargs is None else dict(dask_kwargs)
    # A cluster handed in by the caller is not ours to shut down.
    owns_cluster = 'cluster' not in dask_kwargs

    client = None
    cluster = None
    pbar = None
    try:
        if not owns_cluster:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = dask_kwargs.pop('diagnostics_port', None)

            workers = n_jobs_to_workers(n_jobs)
            # No point starting more workers than there are chunks.
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # If a dashboard port was requested, report the port the
            # scheduler actually bound the bokeh service to.
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)

        # Scatter the entityset.
        # Futures are denoted with a leading underscore.
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            # Published under the future's own key.
            # NOTE(review): the lookup above assumes that key equals
            # es_token -- confirm against distributed's scatter key naming.
            client.publish_dataset(**{_es.key: _es})

        # Pickle the features in memory and scatter the resulting bytes.
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # Map chunks.
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if pbar is not None:
                    pbar.update()
    finally:
        # Always release resources, including when a chunk fails.
        if pbar is not None:
            pbar.close()
        if owns_cluster and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
"y_3857", "log10_range", "created", # Filtering "lat", "lon", "Description", "Status", "mcc", "net", # Hover info ]] # Persist and publish Dask dataframe in memory cell_towers_ddf = cell_towers_ddf.repartition(npartitions=8).persist() # Clear any published datasets for k in client.list_datasets(): client.unpublish_dataset(k) client.publish_dataset(cell_towers_ddf=cell_towers_ddf) data_3857 = dask.compute( [cell_towers_ddf["x_3857"].min(), cell_towers_ddf["y_3857"].min()], [cell_towers_ddf["x_3857"].max(), cell_towers_ddf["y_3857"].max()], ) data_center_3857 = [[ (data_3857[0][0] + data_3857[1][0]) / 2.0, (data_3857[0][1] + data_3857[1][1]) / 2.0, ]] data_4326 = epsg_3857_to_4326(data_3857) data_center_4326 = epsg_3857_to_4326(data_center_3857)