def default(glyph, source, schema, canvas, summary):
    create, info, append, _, finalize = compile_components(summary, schema, glyph)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or glyph.compute_x_bounds(source)
    y_range = canvas.y_range or glyph.compute_y_bounds(source)

    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)

    bases = create((height, width))
    extend(bases, source, x_st + y_st, x_range + y_range)

    return finalize(bases,
                    coords=OrderedDict([(glyph.x_label, x_axis),
                                        (glyph.y_label, y_axis)]),
                    dims=[glyph.y_label, glyph.x_label])
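
# Illustrative sketch only: compute_scale_and_translate above returns a linear
# (scale, translate) pair per axis. The hypothetical helper below shows the
# general idea of such a mapping (data range -> pixel bin); it is not the
# actual axis implementation.
def _scale_translate_sketch_example():
    """Map a data range onto integer pixel bins with a linear transform."""
    import numpy as np

    def scale_translate(x_range, width):
        x0, x1 = x_range
        s = width / (x1 - x0)   # pixels per data unit
        t = -x0 * s             # shift so x0 lands on pixel 0
        return s, t

    s, t = scale_translate((0.0, 10.0), 5)
    xs = np.array([0.0, 2.5, 9.999])
    return np.floor(xs * s + t).astype(int)  # -> array([0, 1, 4])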
def line(glyph, df, schema, canvas, summary, cuda=False):
    if cuda:
        from cudf import concat
    else:
        from pandas import concat

    shape, bounds, st, axis = shape_bounds_st_and_axis(df, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph, cuda=cuda)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    def chunk(df, df2=None):
        plot_start = True
        if df2 is not None:
            df = concat([df.iloc[-1:], df2])
            plot_start = False
        aggs = create(shape)
        extend(aggs, df, st, bounds, plot_start=plot_start)
        return aggs

    name = tokenize(df.__dask_tokenize__(), canvas, glyph, summary)
    old_name = df.__dask_tokenize__()
    dsk = {(name, 0): (chunk, (old_name, 0))}
    for i in range(1, df.npartitions):
        dsk[(name, i)] = (chunk, (old_name, i - 1), (old_name, i))
    keys2 = [(name, i) for i in range(df.npartitions)]
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(cuda=cuda, coords=axis,
                      dims=[glyph.y_label, glyph.x_label]))
    return dsk, name
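
# Minimal sketch (toy data) of the partition-overlap trick used in chunk()
# above: each partition after the first is prepended with the final row of the
# preceding partition, so a line segment that crosses the partition boundary
# is still rasterized.
def _line_partition_overlap_example():
    """Show what concat([df.iloc[-1:], df2]) produces for two toy partitions."""
    import pandas as pd
    part0 = pd.DataFrame({'x': [0, 1, 2], 'y': [0, 1, 0]})
    part1 = pd.DataFrame({'x': [3, 4], 'y': [1, 0]})
    # part1 now starts with part0's last vertex (2, 0), so the segment
    # (2, 0) -> (3, 1) spanning the boundary is included in this chunk
    return pd.concat([part0.iloc[-1:], part1])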
def default(glyph, df, schema, canvas, summary):
    shape, bounds, st, axis = shape_bounds_st_and_axis(df, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    name = tokenize(df.__dask_tokenize__(), canvas, glyph, summary)
    keys = df.__dask_keys__()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=axis, dims=[glyph.y_label, glyph.x_label]))
    return dsk, name
def dask_rectilinear(glyph, xr_ds, schema, canvas, summary, cuda):
    shape, bounds, st, axis = shape_bounds_st_and_axis(xr_ds, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph, cuda=cuda)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    # Build chunk indices for coordinates
    chunk_inds = {}
    for k, chunks in xr_ds.chunks.items():
        chunk_inds[k] = [0] + list(np.cumsum(chunks))

    x_name = glyph.x
    y_name = glyph.y
    coords = xr_ds[glyph.name].coords
    coord_dims = list(coords.dims)
    xdim_ind = coord_dims.index(x_name)
    ydim_ind = coord_dims.index(y_name)
    var_name = list(xr_ds.data_vars.keys())[0]

    # Compute interval breaks
    xs = xr_ds[x_name].values
    ys = xr_ds[y_name].values
    x_breaks = glyph.infer_interval_breaks(xs)
    y_breaks = glyph.infer_interval_breaks(ys)

    def chunk(np_arr, *inds):
        # Reconstruct dataset for chunk from numpy array and chunk indices
        chunk_coords_list = []
        for i, coord_name in enumerate(coords.dims):
            chunk_number = inds[i]
            coord_slice = slice(chunk_inds[coord_name][chunk_number],
                                chunk_inds[coord_name][chunk_number + 1])
            chunk_coords_list.append(
                [coord_name, coords[coord_name][coord_slice]])
        chunk_coords = OrderedDict(chunk_coords_list)
        chunk_ds = xr.DataArray(np_arr,
                                coords=chunk_coords,
                                dims=coord_dims,
                                name=var_name).to_dataset()

        # Compute chunk x/y breaks
        x_chunk_number = inds[xdim_ind]
        x_breaks_slice = slice(chunk_inds[x_name][x_chunk_number],
                               chunk_inds[x_name][x_chunk_number + 1] + 1)
        x_breaks_chunk = x_breaks[x_breaks_slice]

        y_chunk_number = inds[ydim_ind]
        y_breaks_slice = slice(chunk_inds[y_name][y_chunk_number],
                               chunk_inds[y_name][y_chunk_number + 1] + 1)
        y_breaks_chunk = y_breaks[y_breaks_slice]

        # Initialize aggregation buffers
        aggs = create(shape)

        # Perform aggregation
        extend(aggs, chunk_ds, st, bounds,
               x_breaks=x_breaks_chunk, y_breaks=y_breaks_chunk)
        return aggs

    name = tokenize(xr_ds.__dask_tokenize__(), canvas, glyph, summary)
    keys = [k for row in xr_ds.__dask_keys__()[0] for k in row]
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k, k[1], k[2])) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(cuda=cuda, coords=axis,
                      dims=[glyph.y_label, glyph.x_label]))
    return dsk, name
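
# Hedged sketch of what infer_interval_breaks does conceptually for 1D centers
# (the real implementation lives on the glyph): derive N+1 bin edges from N bin
# centers by taking midpoints and extrapolating half a step at either end.
def _interval_breaks_1d_example(centers=(0.0, 1.0, 2.0)):
    """centers (0, 1, 2) -> breaks (-0.5, 0.5, 1.5, 2.5)."""
    centers = np.asarray(centers, dtype=float)
    mid = 0.5 * (centers[:-1] + centers[1:])
    first = centers[0] - (mid[0] - centers[0])
    last = centers[-1] + (centers[-1] - mid[-1])
    return np.concatenate([[first], mid, [last]])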
def dask_curvilinear(glyph, xr_ds, schema, canvas, summary, cuda):
    shape, bounds, st, axis = shape_bounds_st_and_axis(xr_ds, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph, cuda=cuda)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_coord_name = glyph.x
    y_coord_name = glyph.y
    z_name = glyph.name

    data_dim_names = list(xr_ds[z_name].dims)
    x_coord_dim_names = list(xr_ds[x_coord_name].dims)
    y_coord_dim_names = list(xr_ds[y_coord_name].dims)
    zs = xr_ds[z_name].data
    x_centers = xr_ds[glyph.x].data
    y_centers = xr_ds[glyph.y].data

    var_name = list(xr_ds.data_vars.keys())[0]

    # Validate coordinates
    err_msg = (
        "DataArray {name} is backed by a Dask array, \n"
        "but coordinate {coord} is not backed by a Dask array with identical \n"
        "dimension order and chunks")
    if (not isinstance(x_centers, dask.array.Array) or
            xr_ds[glyph.name].dims != xr_ds[glyph.x].dims or
            xr_ds[glyph.name].chunks != xr_ds[glyph.x].chunks):
        raise ValueError(err_msg.format(name=glyph.name, coord=glyph.x))
    if (not isinstance(y_centers, dask.array.Array) or
            xr_ds[glyph.name].dims != xr_ds[glyph.y].dims or
            xr_ds[glyph.name].chunks != xr_ds[glyph.y].chunks):
        raise ValueError(err_msg.format(name=glyph.name, coord=glyph.y))

    # Make sure coordinates are floats so that overlap with nan will behave properly
    if x_centers.dtype.kind != 'f':
        x_centers = x_centers.astype(np.float64)
    if y_centers.dtype.kind != 'f':
        y_centers = y_centers.astype(np.float64)

    x_overlapped_centers = overlap(x_centers, depth=1, boundary=np.nan)
    y_overlapped_centers = overlap(y_centers, depth=1, boundary=np.nan)

    def chunk(np_zs, np_x_centers, np_y_centers):
        # Handle boundaries that have nothing to overlap with
        for centers in [np_x_centers, np_y_centers]:
            if np.isnan(centers[0, :]).all():
                centers[0, :] = centers[1, :] - (centers[2, :] - centers[1, :])
            if np.isnan(centers[-1, :]).all():
                centers[-1, :] = centers[-2, :] + (centers[-2, :] - centers[-3, :])
            if np.isnan(centers[:, 0]).all():
                centers[:, 0] = centers[:, 1] - (centers[:, 2] - centers[:, 1])
            if np.isnan(centers[:, -1]).all():
                centers[:, -1] = centers[:, -2] + (centers[:, -2] - centers[:, -3])

        # Compute interval breaks
        x_breaks_chunk = glyph.infer_interval_breaks(np_x_centers)
        y_breaks_chunk = glyph.infer_interval_breaks(np_y_centers)

        # Trim breaks
        x_breaks_chunk = x_breaks_chunk[1:-1, 1:-1]
        y_breaks_chunk = y_breaks_chunk[1:-1, 1:-1]

        # Reconstruct dataset for chunk from numpy array and chunk indices
        chunk_coords = {
            x_coord_name: (x_coord_dim_names, np_x_centers[1:-1, 1:-1]),
            y_coord_name: (y_coord_dim_names, np_y_centers[1:-1, 1:-1]),
        }
        chunk_ds = xr.DataArray(np_zs,
                                coords=chunk_coords,
                                dims=data_dim_names,
                                name=var_name).to_dataset()

        # Initialize aggregation buffers
        aggs = create(shape)

        # Perform aggregation
        extend(aggs, chunk_ds, st, bounds,
               x_breaks=x_breaks_chunk, y_breaks=y_breaks_chunk)
        return aggs

    result_name = tokenize(xr_ds.__dask_tokenize__(), canvas, glyph, summary)

    z_keys = [k for row in zs.__dask_keys__() for k in row]
    x_overlap_keys = [
        k for row in x_overlapped_centers.__dask_keys__() for k in row
    ]
    y_overlap_keys = [
        k for row in y_overlapped_centers.__dask_keys__() for k in row
    ]

    result_keys = [(result_name, i) for i in range(len(z_keys))]

    dsk = dict(
        (res_k, (chunk, z_k, x_k, y_k))
        for (res_k, z_k, x_k, y_k)
        in zip(result_keys, z_keys, x_overlap_keys, y_overlap_keys))

    dsk[result_name] = (apply, finalize, [(combine, result_keys)],
                        dict(cuda=cuda, coords=axis,
                             dims=[glyph.y_label, glyph.x_label]))

    # Add x/y coord tasks to task graph
    dsk.update(x_overlapped_centers.dask)
    dsk.update(y_overlapped_centers.dask)

    return dsk, result_name
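
# Toy illustration of the depth-1 halo built with overlap() above: each chunk
# gains one row/column from its neighbours, and the outer edges are padded
# with NaN, which chunk() then replaces with extrapolated centers.
def _overlap_halo_example():
    """A 4x4 array in 2x2 chunks has 4x4 chunks after a depth-1 overlap."""
    import dask.array as da
    from dask.array.overlap import overlap as da_overlap
    centers = da.arange(16, dtype='f8').reshape(4, 4).rechunk((2, 2))
    haloed = da_overlap(centers, depth=1, boundary=np.nan)
    return haloed.chunks  # -> ((4, 4), (4, 4))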
def dask_raster(glyph, xr_ds, schema, canvas, summary, cuda):
    shape, bounds, st, axis = shape_bounds_st_and_axis(xr_ds, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph, cuda=cuda)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    # Build chunk indices for coordinates
    chunk_inds = {}
    for k, chunks in xr_ds.chunks.items():
        chunk_inds[k] = [0] + list(np.cumsum(chunks))

    x_name = glyph.x
    y_name = glyph.y
    coords = xr_ds[glyph.name].coords
    coord_dims = list(coords.dims)
    xdim_ind = coord_dims.index(x_name)
    ydim_ind = coord_dims.index(y_name)
    var_name = list(xr_ds.data_vars.keys())[0]

    # Pre-compute bin sizes. We do this here to handle length-1 chunks
    src_x0, src_x1 = glyph._compute_bounds_from_1d_centers(
        xr_ds, x_name, maybe_expand=False, orient=False)
    src_y0, src_y1 = glyph._compute_bounds_from_1d_centers(
        xr_ds, y_name, maybe_expand=False, orient=False)
    xbinsize = float(xr_ds[x_name][1] - xr_ds[x_name][0])
    ybinsize = float(xr_ds[y_name][1] - xr_ds[y_name][0])

    # Compute scale/translate
    out_h, out_w = shape
    src_h, src_w = [xr_ds[glyph.name].shape[i] for i in [ydim_ind, xdim_ind]]
    out_x0, out_x1, out_y0, out_y1 = bounds

    scale_y, translate_y = build_scale_translate(
        out_h, out_y0, out_y1, src_h, src_y0, src_y1)

    scale_x, translate_x = build_scale_translate(
        out_w, out_x0, out_x1, src_w, src_x0, src_x1)

    def chunk(np_arr, *inds):
        # Reconstruct dataset for chunk from numpy array and chunk indices
        chunk_coords_list = []
        for i, coord_name in enumerate(coords.dims):
            chunk_number = inds[i]
            coord_slice = slice(chunk_inds[coord_name][chunk_number],
                                chunk_inds[coord_name][chunk_number + 1])
            chunk_coords_list.append(
                [coord_name, coords[coord_name][coord_slice]])
        chunk_coords = OrderedDict(chunk_coords_list)
        chunk_ds = xr.DataArray(np_arr,
                                coords=chunk_coords,
                                dims=coord_dims,
                                name=var_name).to_dataset()

        # Compute offsets
        x_chunk_number = inds[xdim_ind]
        offset_x = chunk_inds[x_name][x_chunk_number]

        y_chunk_number = inds[ydim_ind]
        offset_y = chunk_inds[y_name][y_chunk_number]

        # Initialize aggregation buffers
        aggs = create(shape)

        # Perform aggregation
        extend(aggs, chunk_ds, st, bounds,
               scale_x=scale_x, scale_y=scale_y,
               translate_x=translate_x, translate_y=translate_y,
               offset_x=offset_x, offset_y=offset_y,
               src_xbinsize=xbinsize, src_ybinsize=ybinsize)

        return aggs

    name = tokenize(xr_ds.__dask_tokenize__(), canvas, glyph, summary)
    keys = [k for row in xr_ds.__dask_keys__()[0] for k in row]
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k, k[1], k[2])) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(cuda=cuda, coords=axis,
                      dims=[glyph.y_label, glyph.x_label]))
    return dsk, name
def default(glyph, df, schema, canvas, summary, cuda=False):
    shape, bounds, st, axis = shape_bounds_st_and_axis(df, canvas, glyph)

    # Compile functions
    create, info, append, combine, finalize = \
        compile_components(summary, schema, glyph, cuda=cuda)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    # Here be dragons
    # Get the dataframe graph
    graph = df.__dask_graph__()

    # Guess a reasonable output dtype from combination of dataframe dtypes
    dtypes = []

    for dt in df.dtypes:
        if isinstance(dt, pd.CategoricalDtype):
            continue
        elif isinstance(dt, pd.api.extensions.ExtensionDtype):
            # RaggedArray implementation and
            # https://github.com/pandas-dev/pandas/issues/22224
            try:
                subdtype = dt.subtype
            except AttributeError:
                continue
            else:
                dtypes.append(subdtype)
        else:
            dtypes.append(dt)

    dtype = np.result_type(*dtypes)

    # Create a meta object so that dask.array doesn't try to look
    # too closely at the type of the chunks it's wrapping;
    # they're actually dataframes, but we tell dask they're ndarrays
    meta = np.empty((0, ), dtype=dtype)

    # Create a chunks tuple, a singleton for each dataframe chunk.
    # The number of chunks + structure needs to match that of
    # the dataframe, so that we can use the dataframe graph keys,
    # but we don't have to be precise with the chunk size.
    # We could use np.nan instead of 1 to indicate that we actually
    # don't know how large the chunk is
    chunks = (tuple(1 for _ in range(df.npartitions)), )

    # Now create a dask array from the dataframe graph layer.
    # It's a dask array of dataframes, which is dodgy but useful
    # for the following reasons:
    #
    # (1) The dataframes get converted to a single array by
    #     the datashader reduction functions anyway
    # (2) dask.array.reduction is handy for coding a tree
    #     reduction of arrays
    df_array = da.Array(graph, df._name, chunks, meta=meta)

    # A sufficient condition for ensuring the chimera holds together
    assert list(df_array.__dask_keys__()) == list(df.__dask_keys__())

    def chunk(df, axis, keepdims):
        """ used in the dask.array.reduction chunk step """
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    def wrapped_combine(x, axis, keepdims):
        """ wrap datashader combine in dask.array.reduction combine """
        if isinstance(x, list):
            # list of tuples of ndarrays
            return combine(x)
        elif isinstance(x, tuple):
            # tuple with single ndarray
            return x
        else:
            raise TypeError("Unknown type %s in wrapped_combine" % type(x))

    local_axis = axis

    def aggregate(x, axis, keepdims):
        """ Wrap datashader finalize in dask.array.reduction aggregate """
        return finalize(wrapped_combine(x, axis, keepdims),
                        cuda=cuda,
                        coords=local_axis,
                        dims=[glyph.y_label, glyph.x_label])

    R = da.reduction(
        df_array,
        aggregate=aggregate,
        chunk=chunk,
        combine=wrapped_combine,
        # Control granularity of tree branching; less is more
        split_every=2,
        # We don't want np.concatenate called during combine and
        # aggregate. It'll fail because we're handling tuples of
        # ndarrays and lists of tuples of ndarrays
        concatenate=False,
        # Prevent dask from internally inspecting
        # chunk, combine and aggregate
        meta=meta,
        # Provide some sort of dtype for the resultant dask array
        dtype=meta.dtype)

    return R, R.name
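
# Toy sketch of the same tree-reduction machinery used above: with
# concatenate=False, dask.array.reduction hands the combine/aggregate steps
# plain (nested) lists of whatever the chunk step returned, which is exactly
# what wrapped_combine relies on. Names and data here are hypothetical.
def _tree_reduction_example():
    """Sum a small dask array through an explicit chunk/combine/aggregate tree."""
    import numpy as np
    import dask.array as da

    x = da.arange(8, chunks=2)

    def chunk(block, axis, keepdims):
        return block.sum()

    def combine(parts, axis, keepdims):
        # parts is a list of chunk/combine outputs when concatenate=False
        return np.sum(parts) if isinstance(parts, list) else parts

    def aggregate(parts, axis, keepdims):
        return combine(parts, axis, keepdims)

    total = da.reduction(x, chunk=chunk, combine=combine, aggregate=aggregate,
                         split_every=2, concatenate=False, dtype='i8')
    return total.compute()  # -> 28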