def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def test_make_regression(n_samples, n_features, n_informative,
                         n_targets, bias, effective_rank,
                         tail_strength, noise, shuffle,
                         coef, n_parts, order,
                         use_full_low_rank, client):
    c = client
    from cuml.dask.datasets import make_regression

    result = make_regression(n_samples=n_samples, n_features=n_features,
                             n_informative=n_informative,
                             n_targets=n_targets, bias=bias,
                             effective_rank=effective_rank, noise=noise,
                             shuffle=shuffle, coef=coef,
                             n_parts=n_parts,
                             use_full_low_rank=use_full_low_rank,
                             order=order)
    if coef:
        out, values, coefs = result
    else:
        out, values = result

    assert out.shape == (n_samples, n_features), "out shape mismatch"

    if n_targets > 1:
        assert values.shape == (n_samples, n_targets), \
            "values shape mismatch"
    else:
        assert values.shape == (n_samples,), "values shape mismatch"

    assert len(out.chunks[0]) == n_parts
    assert len(out.chunks[1]) == 1

    if coef:
        if n_targets > 1:
            assert coefs.shape == (n_features, n_targets), \
                "coefs shape mismatch"
            assert len(coefs.chunks[1]) == 1
        else:
            assert coefs.shape == (n_features,), "coefs shape mismatch"
            assert len(coefs.chunks[0]) == 1

        test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)
        std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

        test1, std_test2 = da.compute(test1, std_test2)

        diff = cp.abs(1.0 - std_test2)
        test2 = cp.all(diff < 1.5 * 10**(-1.))

        assert test1, \
            "Unexpected number of informative features"

        assert test2, "Unexpectedly incongruent outputs"

    data_ddh = DistributedDataHandler.create(data=(out, values),
                                             client=c)
    out_part, value_part = data_ddh.gpu_futures[0][1].result()

    if coef:
        coefs_ddh = DistributedDataHandler.create(data=coefs,
                                                  client=c)
        coefs_part = coefs_ddh.gpu_futures[0][1].result()

    if order == 'F':
        assert out_part.flags['F_CONTIGUOUS']
        if n_targets > 1:
            assert value_part.flags['F_CONTIGUOUS']
        if coef:
            assert coefs_part.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert out_part.flags['C_CONTIGUOUS']
        if n_targets > 1:
            assert value_part.flags['C_CONTIGUOUS']
        if coef:
            assert coefs_part.flags['C_CONTIGUOUS']
# https://software.intel.com/en-us/blogs/2016/04/04/unleash-parallel-performance-of-python-programs
import time

import dask
import dask.array as da

x = da.random.random((100000, 2000), chunks=(10000, 2000))
t0 = time.time()
q, r = da.linalg.qr(x)
test = da.all(da.isclose(x, q.dot(r)))
assert test.compute()  # compute(get=dask.threaded.get) by default
print(time.time() - t0)
# python -m TBB intelCompilerTest.py
def dataset_chunks(datasets, time_bin_secs, max_row_chunks):
    """
    Given ``max_row_chunks`` determine a chunking strategy
    for each dataset that prevents binning unique times in
    separate chunks.
    """
    # Calculate (utime, idx, counts) tuple for each dataset
    # then transpose to get lists for each tuple entry
    if len(datasets) == 0:
        return (), ()

    utimes = []
    interval_avg = []
    counts = []
    monotonicity_checks = []

    for ds in datasets:
        # Compute unique times, their counts and interval sum
        # for each row chunk
        block_values = da.blockwise(_time_interval_sum, "r",
                                    ds.TIME.data, "r",
                                    ds.INTERVAL.data, "r",
                                    meta=np.empty((0,), dtype=object),
                                    dtype=object)

        # Reduce each row chunk's values
        reduction = da.reduction(block_values,
                                 chunk=_chunk,
                                 combine=_time_int_combine,
                                 aggregate=_time_int_agg,
                                 concatenate=False,
                                 split_every=16,
                                 meta=np.empty((0,), dtype=object),
                                 dtype=object)

        # Pull out the final unique times, counts and interval average
        utime = reduction.map_blocks(getitem, 0, dtype=ds.TIME.dtype)
        count = reduction.map_blocks(getitem, 1, dtype=np.int32)
        int_avg = reduction.map_blocks(getitem, 2, dtype=ds.INTERVAL.dtype)

        # Check monotonicity of TIME while we're at it
        is_monotonic = da.all(da.diff(ds.TIME.data) >= 0.0)

        utimes.append(utime)
        counts.append(count)
        interval_avg.append(int_avg)
        monotonicity_checks.append(is_monotonic)

    # Work out the unique times, average intervals for those times
    # and the frequency of those times
    (ds_utime,
     ds_avg_intervals,
     ds_counts,
     ds_monotonicity_checks) = dask.compute(utimes, interval_avg,
                                            counts, monotonicity_checks)

    if not all(ds_monotonicity_checks):
        raise ValueError("TIME is not monotonically increasing. "
                         "This is required.")

    # Produce row and time chunking strategies for each dataset
    ds_row_chunks = []
    ds_time_chunks = []
    ds_interval_secs = []

    it = zip(ds_utime, ds_avg_intervals, ds_counts)

    for di, (utime, avg_interval, counts) in enumerate(it):
        # Maintain row and time chunks for this dataset
        row_chunks = []
        time_chunks = []
        interval_secs = []

        # Start out with first entries
        bin_rows = counts[0]
        bin_times = 1
        bin_secs = avg_interval[0]

        dsit = enumerate(zip(utime[1:], avg_interval[1:], counts[1:]))

        for ti, (ut, avg_int, count) in dsit:
            if count > max_row_chunks:
                logger.warning("Unique time {:3f} occurred {:d} times "
                               "in dataset {:d} but this exceeds the "
                               "requested row chunks {:d}. "
                               "Consider increasing --row-chunks",
                               ut, count, di, max_row_chunks)

            if avg_int > time_bin_secs:
                logger.warning("The average INTERVAL associated with "
                               "unique time {:3f} in dataset {:d} "
                               "is {:3f} but this exceeds the requested "
                               "number of seconds in a time bin {:3f}s. "
                               "Consider increasing --time-bin-secs",
                               ut, di, avg_int, time_bin_secs)

            next_rows = bin_rows + count

            # If we're still within the number of rows for this bin
            # keep going
            if next_rows < max_row_chunks:
                bin_rows = next_rows
                bin_times += 1
                bin_secs += avg_int
            # Otherwise finalize this bin and
            # start a new one with the counts
            # we were trying to add
            else:
                row_chunks.append(bin_rows)
                time_chunks.append(bin_times)
                interval_secs.append(bin_secs)
                bin_rows = count
                bin_times = 1
                bin_secs = avg_int

        # Finish any remaining bins
        if bin_rows > 0:
            assert bin_times > 0
            row_chunks.append(bin_rows)
            time_chunks.append(bin_times)
            interval_secs.append(bin_secs)

        row_chunks = tuple(row_chunks)
        time_chunks = tuple(time_chunks)
        interval_secs = tuple(interval_secs)

        ds_row_chunks.append(row_chunks)
        ds_time_chunks.append(time_chunks)
        ds_interval_secs.append(interval_secs)

    logger.info("Dataset Chunking: (r)ow - (t)imes - (s)econds")

    it = zip(datasets, ds_row_chunks, ds_time_chunks, ds_interval_secs)
    for di, (ds, ds_rcs, ds_tcs, ds_int_secs) in enumerate(it):
        ds_rows = ds.dims['row']
        ds_crows = sum(ds_rcs)

        if not ds_rows == ds_crows:
            raise ValueError("Number of dataset rows %d "
                             "does not match the sum %d "
                             "of the row chunks %s"
                             % (ds_rows, ds_crows, ds_rcs))

        log_str = ", ".join("(%dr,%dt,%.1fs)" % (rc, tc, its)
                            for rc, tc, its
                            in zip(*(ds_rcs, ds_tcs, ds_int_secs)))

        logger.info("Dataset {d}: {s}", d=di, s=log_str)

    return ds_row_chunks, ds_time_chunks
def new_grid_mapping_from_coords(
        x_coords: xr.DataArray,
        y_coords: xr.DataArray,
        crs: Union[str, pyproj.crs.CRS],
        *,
        tile_size: Union[int, Tuple[int, int]] = None,
        tolerance: float = DEFAULT_TOLERANCE,
) -> GridMapping:
    crs = _normalize_crs(crs)
    assert_instance(x_coords, xr.DataArray, name='x_coords')
    assert_instance(y_coords, xr.DataArray, name='y_coords')
    assert_true(x_coords.ndim in (1, 2),
                'x_coords and y_coords must be either 1D or 2D arrays')
    assert_instance(tolerance, float, name='tolerance')
    assert_true(tolerance > 0.0,
                'tolerance must be greater than zero')

    if x_coords.name and y_coords.name:
        xy_var_names = str(x_coords.name), str(y_coords.name)
    else:
        xy_var_names = _default_xy_var_names(crs)

    tile_size = _normalize_int_pair(tile_size, default=None)

    is_lon_360 = None  # None means "not yet known"
    if crs.is_geographic:
        is_lon_360 = bool(np.any(x_coords > 180))

    x_res = 0
    y_res = 0

    if x_coords.ndim == 1:
        # We have 1D x,y coordinates
        cls = Coords1DGridMapping

        assert_true(x_coords.size >= 2 and y_coords.size >= 2,
                    'sizes of x_coords and y_coords 1D arrays must be >= 2')

        size = x_coords.size, y_coords.size

        x_dim, y_dim = x_coords.dims[0], y_coords.dims[0]

        x_diff = _abs_no_zero(x_coords.diff(dim=x_dim).values)
        y_diff = _abs_no_zero(y_coords.diff(dim=y_dim).values)

        if not is_lon_360 and crs.is_geographic:
            is_anti_meridian_crossed = np.any(np.nanmax(x_diff) > 180)
            if is_anti_meridian_crossed:
                x_coords = to_lon_360(x_coords)
                x_diff = _abs_no_zero(x_coords.diff(dim=x_dim))
                is_lon_360 = True

        x_res, y_res = x_diff[0], y_diff[0]
        x_diff_equal = np.allclose(x_diff, x_res, atol=tolerance)
        y_diff_equal = np.allclose(y_diff, y_res, atol=tolerance)
        is_regular = x_diff_equal and y_diff_equal
        if is_regular:
            x_res = round_to_fraction(x_res, 5, 0.25)
            y_res = round_to_fraction(y_res, 5, 0.25)
        else:
            x_res = round_to_fraction(float(np.nanmedian(x_diff)), 2, 0.5)
            y_res = round_to_fraction(float(np.nanmedian(y_diff)), 2, 0.5)

        if tile_size is None \
                and x_coords.chunks is not None \
                and y_coords.chunks is not None:
            tile_size = (max(0, *x_coords.chunks[0]),
                         max(0, *y_coords.chunks[0]))

        # Guess j axis direction
        is_j_axis_up = bool(y_coords[0] < y_coords[-1])

    else:
        # We have 2D x,y coordinates
        cls = Coords2DGridMapping

        assert_true(x_coords.shape == y_coords.shape,
                    'shapes of x_coords and y_coords'
                    ' 2D arrays must be equal')
        assert_true(x_coords.dims == y_coords.dims,
                    'dimensions of x_coords and y_coords'
                    ' 2D arrays must be equal')

        y_dim, x_dim = x_coords.dims

        height, width = x_coords.shape
        size = width, height

        x = da.asarray(x_coords)
        y = da.asarray(y_coords)

        x_x_diff = _abs_no_nan(da.diff(x, axis=1))
        x_y_diff = _abs_no_nan(da.diff(x, axis=0))
        y_x_diff = _abs_no_nan(da.diff(y, axis=1))
        y_y_diff = _abs_no_nan(da.diff(y, axis=0))

        if not is_lon_360 and crs.is_geographic:
            is_anti_meridian_crossed = da.any(da.max(x_x_diff) > 180) \
                                       or da.any(da.max(x_y_diff) > 180)
            if is_anti_meridian_crossed:
                x_coords = to_lon_360(x_coords)
                x = da.asarray(x_coords)
                x_x_diff = _abs_no_nan(da.diff(x, axis=1))
                x_y_diff = _abs_no_nan(da.diff(x, axis=0))
                is_lon_360 = True

        is_regular = False

        if da.all(x_y_diff == 0) and da.all(y_x_diff == 0):
            x_res = x_x_diff[0, 0]
            y_res = y_y_diff[0, 0]
            is_regular = \
                da.allclose(x_x_diff[0, :], x_res, atol=tolerance) \
                and da.allclose(x_x_diff[-1, :], x_res, atol=tolerance) \
                and da.allclose(y_y_diff[:, 0], y_res, atol=tolerance) \
                and da.allclose(y_y_diff[:, -1], y_res, atol=tolerance)

        if not is_regular:
            # Let diff arrays have same shape as original by
            # doubling last rows and columns.
            x_x_diff_c = da.concatenate([x_x_diff, x_x_diff[:, -1:]], axis=1)
            y_x_diff_c = da.concatenate([y_x_diff, y_x_diff[:, -1:]], axis=1)
            x_y_diff_c = da.concatenate([x_y_diff, x_y_diff[-1:, :]], axis=0)
            y_y_diff_c = da.concatenate([y_y_diff, y_y_diff[-1:, :]], axis=0)

            # Find resolution via area
            x_abs_diff = da.sqrt(da.square(x_x_diff_c)
                                 + da.square(x_y_diff_c))
            y_abs_diff = da.sqrt(da.square(y_x_diff_c)
                                 + da.square(y_y_diff_c))

            if crs.is_geographic:
                # Convert degrees into meters
                x_abs_diff_r = da.radians(x_abs_diff)
                y_abs_diff_r = da.radians(y_abs_diff)
                x_abs_diff = _ER * da.cos(x_abs_diff_r) * y_abs_diff_r
                y_abs_diff = _ER * y_abs_diff_r

            xy_areas = (x_abs_diff * y_abs_diff).flatten()
            xy_areas = da.where(xy_areas > 0, xy_areas, np.nan)

            # Get indices of min and max area
            xy_area_index_min = da.nanargmin(xy_areas)
            xy_area_index_max = da.nanargmax(xy_areas)

            # Convert area to edge length
            xy_res_min = math.sqrt(xy_areas[xy_area_index_min])
            xy_res_max = math.sqrt(xy_areas[xy_area_index_max])

            # Empirically weight min more than max
            xy_res = 0.7 * xy_res_min + 0.3 * xy_res_max

            if crs.is_geographic:
                # Convert meters back into degrees
                # print(f'xy_res in meters: {xy_res}')
                xy_res = math.degrees(xy_res / _ER)
                # print(f'xy_res in degrees: {xy_res}')

            # Because this is an estimation, we can round to a nice number
            xy_res = round_to_fraction(xy_res, digits=1, resolution=0.5)

            x_res, y_res = float(xy_res), float(xy_res)

        if tile_size is None and x_coords.chunks is not None:
            j_chunks, i_chunks = x_coords.chunks
            tile_size = max(0, *i_chunks), max(0, *j_chunks)

        if tile_size is not None:
            tile_width, tile_height = tile_size
            x_coords = x_coords.chunk((tile_height, tile_width))
            y_coords = y_coords.chunk((tile_height, tile_width))

        # Guess j axis direction
        is_j_axis_up = np.all(y_coords[0, :] < y_coords[-1, :]) or None

    assert_true(x_res > 0 and y_res > 0,
                'internal error: x_res and y_res could not be determined',
                exception_type=RuntimeError)

    x_res, y_res = _to_int_or_float(x_res), _to_int_or_float(y_res)
    x_res_05, y_res_05 = x_res / 2, y_res / 2
    x_min = _to_int_or_float(x_coords.min() - x_res_05)
    y_min = _to_int_or_float(y_coords.min() - y_res_05)
    x_max = _to_int_or_float(x_coords.max() + x_res_05)
    y_max = _to_int_or_float(y_coords.max() + y_res_05)

    return cls(x_coords=x_coords,
               y_coords=y_coords,
               crs=crs,
               size=size,
               tile_size=tile_size,
               xy_bbox=(x_min, y_min, x_max, y_max),
               xy_res=(x_res, y_res),
               xy_var_names=xy_var_names,
               xy_dim_names=(str(x_dim), str(y_dim)),
               is_regular=is_regular,
               is_lon_360=is_lon_360,
               is_j_axis_up=is_j_axis_up)
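# For orientation, a hedged usage sketch of the 1D branch above. It assumes an
# xcube-like environment where new_grid_mapping_from_coords is importable and
# that 'EPSG:4326' is accepted by _normalize_crs; the coordinate values are
# made up for illustration.

import numpy as np
import xarray as xr

lon = xr.DataArray(np.arange(-179.5, 180.0, 1.0), dims='lon', name='lon')
lat = xr.DataArray(np.arange(-89.5, 90.0, 1.0), dims='lat', name='lat')

# Evenly spaced 1D coordinates should take the Coords1DGridMapping path and
# yield a regular grid of roughly 1-degree resolution.
gm = new_grid_mapping_from_coords(lon, lat, crs='EPSG:4326')
# e.g. gm.is_regular, gm.xy_res, gm.xy_bbox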
def lengths_and_angles_to_box_vectors(a_length, b_length, c_length,
                                      alpha, beta, gamma):
    """Convert from the lengths/angles of the unit cell to the box
    vectors (Bravais vectors). The angles should be in degrees.

    Mimics mdtraj.core.unitcell.lengths_and_angles_to_box_vectors()

    Parameters
    ----------
    a_length : scalar or ndarray
        length of Bravais unit vector **a**
    b_length : scalar or ndarray
        length of Bravais unit vector **b**
    c_length : scalar or ndarray
        length of Bravais unit vector **c**
    alpha : scalar or ndarray
        angle between vectors **b** and **c**, in degrees.
    beta : scalar or ndarray
        angle between vectors **c** and **a**, in degrees.
    gamma : scalar or ndarray
        angle between vectors **a** and **b**, in degrees.

    Returns
    -------
    a : dask.array
        If the inputs are scalar, the vectors will be one dimensional
        (length 3). If the inputs are one dimension, shape=(n_frames, ),
        then the output will be (n_frames, 3)
    b : dask.array
        If the inputs are scalar, the vectors will be one dimensional
        (length 3). If the inputs are one dimension, shape=(n_frames, ),
        then the output will be (n_frames, 3)
    c : dask.array
        If the inputs are scalar, the vectors will be one dimensional
        (length 3). If the inputs are one dimension, shape=(n_frames, ),
        then the output will be (n_frames, 3)

    This code is adapted from gyroid, which is licensed under the BSD
    http://pythonhosted.org/gyroid/_modules/gyroid/unitcell.html
    """
    # Fix for da that requires angles and lengths to be arrays
    lengths = [a_length, b_length, c_length]
    for i, e in enumerate(lengths):
        # Use Python's short-circuit logic to avoid computing dask arrays
        if not isinstance(e, da.core.Array) and np.isscalar(e):
            lengths[i] = np.array([e])
    a_length, b_length, c_length = tuple(lengths)

    angles = [alpha, beta, gamma]
    for i, e in enumerate(angles):
        if not isinstance(e, da.core.Array) and np.isscalar(e):
            angles[i] = np.array([e])
    alpha, beta, gamma = tuple(angles)

    if da.all(alpha < 2 * np.pi) and (
        da.all(beta < 2 * np.pi) and da.all(gamma < 2 * np.pi)
    ):
        warnings.warn(
            "All your angles were less than 2*pi."
            " Did you accidentally give me radians?"
        )

    alpha = alpha * np.pi / 180
    beta = beta * np.pi / 180
    gamma = gamma * np.pi / 180

    a = da.stack([a_length, da.zeros_like(a_length), da.zeros_like(a_length)])
    b = da.stack(
        [b_length * da.cos(gamma), b_length * da.sin(gamma),
         da.zeros_like(b_length)]
    )
    cx = c_length * da.cos(beta)
    cy = c_length * (da.cos(alpha) - da.cos(beta) * da.cos(gamma)) / da.sin(gamma)
    cz = da.sqrt(c_length * c_length - cx * cx - cy * cy)
    c = da.stack([cx, cy, cz])

    if not a.shape == b.shape == c.shape:
        raise TypeError("Shape is messed up.")

    # Make sure that all vector components that are _almost_ 0 are set
    # exactly to 0
    tol = 1e-6
    a[da.logical_and(a > -tol, a < tol)] = 0.0
    b[da.logical_and(b > -tol, b < tol)] = 0.0
    c[da.logical_and(c > -tol, c < tol)] = 0.0

    return a.T, b.T, c.T
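# A hedged usage sketch for the converter above: per-frame cell lengths and
# angles (in degrees) go in as dask arrays and three (n_frames, 3) Bravais
# vectors come out as dask arrays. The numbers below are illustrative only.

import dask.array as da

n_frames = 100
lengths = da.ones((n_frames,), chunks=50) * 4.5   # cell edge lengths
angles = da.ones((n_frames,), chunks=50) * 90.0   # cell angles in degrees

a, b, c = lengths_and_angles_to_box_vectors(
    lengths, lengths, lengths, angles, angles, angles
)
# a.shape == b.shape == c.shape == (n_frames, 3); materialize with
# da.compute(a, b, c) when the concrete vectors are needed.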
#jetM = jetM_[runMask][:nJets]
#print " >> %s: %s"%('jetM', jetM.shape)
#jetPt = jetPt_[runMask][:nJets]
#print " >> %s: %s"%('jetPt', jetPt.shape)
X_jets0 = X_jets0_[runMask][:nJets]
print " >> %s: %s" % ('X_jets', X_jets0.shape)
X_jets1 = X_jets1_[runMask][:nJets]
print " >> %s: %s" % ('X_jets', X_jets1.shape)
X_FC = X_FC_[runMask][:nJets]
print " >> %s: %s" % ('X_FC', X_FC.shape)
#X_ECAL_stacked = X_ECAL_stacked_[runMask][:nJets]
#print " >> %s: %s"%('X_ECAL_stacked', X_ECAL_stacked.shape)
y_jets = y_jets_[runMask][:nJets]
print " >> %s: %s" % ('y_jets', y_jets.shape)

assert da.all(jetEventId == jetEventId1)

#file_out_str = "test_jets.hdf5"
file_out_str = "%s/%s/%s_n%d_label%d_jetcombo_run%d.hdf5" % (
    eosDir, decay, decay, nJets, label, i)
print " >> Writing to:", file_out_str
da.to_hdf5(
    file_out_str,
    {
        #'runId': runId,
        #'lumiId': lumiId,
        #'eventId': eventId,
        #'X_ECAL_stacked': X_ECAL_stacked,
        #'y': y,
        'jetRunId': jetRunId,
        'jetEventId': jetEventId,
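# The snippet above is cut off inside the da.to_hdf5 call. For reference, a
# minimal, hedged sketch of the same pattern with made-up array contents:
# dask writes each array in the mapping to its own dataset in one HDF5 file
# (requires h5py).

import dask.array as da

jet_run_id = da.arange(1000, chunks=250)
jet_event_id = da.arange(1000, chunks=250)

da.to_hdf5('jets_example.hdf5', {
    'jetRunId': jet_run_id,      # one HDF5 dataset per dictionary key
    'jetEventId': jet_event_id,
})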
def dataset_chunks(datasets, time_bin_secs, max_row_chunks):
    """
    Given ``max_row_chunks`` determine a chunking strategy
    for each dataset that prevents binning unique times in
    separate chunks.
    """
    # Calculate (utime, idx, counts) tuple for each dataset
    # then transpose to get lists for each tuple entry
    if len(datasets) == 0:
        return (), ()

    utimes = []
    interval_avg = []
    counts = []
    monotonicity_checks = []

    for ds in datasets:
        # Compute unique times, their counts and interval sum
        # for each row chunk
        block_values = da.blockwise(_time_interval_sum, "r",
                                    ds.TIME.data, "r",
                                    ds.INTERVAL.data, "r",
                                    meta=np.empty((0,), dtype=object),
                                    dtype=object)

        # Reduce each row chunk's values
        reduction = da.reduction(block_values,
                                 chunk=_chunk,
                                 combine=_time_int_combine,
                                 aggregate=_time_int_agg,
                                 concatenate=False,
                                 split_every=16,
                                 meta=np.empty((0,), dtype=object),
                                 dtype=object)

        # Pull out the final unique times, counts and interval average
        utime = reduction.map_blocks(getitem, 0, dtype=ds.TIME.dtype)
        count = reduction.map_blocks(getitem, 1, dtype=np.int32)
        int_avg = reduction.map_blocks(getitem, 2, dtype=ds.INTERVAL.dtype)

        # Check monotonicity of TIME while we're at it
        is_monotonic = da.all(da.diff(ds.TIME.data) >= 0.0)

        utimes.append(utime)
        counts.append(count)
        interval_avg.append(int_avg)
        monotonicity_checks.append(is_monotonic)

    # Work out the unique times, average intervals for those times
    # and the frequency of those times
    (ds_utime,
     ds_avg_intervals,
     ds_counts,
     ds_monotonicity_checks) = dask.compute(utimes, interval_avg,
                                            counts, monotonicity_checks)

    if not all(ds_monotonicity_checks):
        raise ValueError("TIME is not monotonically increasing. "
                         "This is required.")

    grouper = DatasetGrouper(time_bin_secs, max_row_chunks)
    res = grouper.group(ds_utime, ds_avg_intervals, ds_counts)
    ds_row_chunks, ds_time_chunks, ds_interval_secs = res

    logger.info("Dataset Chunking: (r)ow - (t)imes - (s)econds")

    it = zip(datasets, ds_row_chunks, ds_time_chunks, ds_interval_secs)
    for di, (ds, ds_rcs, ds_tcs, ds_int_secs) in enumerate(it):
        ds_rows = ds.dims['row']
        ds_crows = sum(ds_rcs)

        if not ds_rows == ds_crows:
            raise ValueError("Number of dataset rows %d "
                             "does not match the sum %d "
                             "of the row chunks %s"
                             % (ds_rows, ds_crows, ds_rcs))

        log_str = ", ".join("(%dr,%dt,%.1fs)" % (rc, tc, its)
                            for rc, tc, its
                            in zip(*(ds_rcs, ds_tcs, ds_int_secs)))

        logger.info("Dataset {d}: {s}", d=di, s=log_str)

    return ds_row_chunks, ds_time_chunks
def _bench(self, get):
    q, r = da.linalg.qr(self.x)
    test = da.all(da.isclose(self.x, q.dot(r)))
    test.compute(get=get)
def qr(x):
    t0 = time.time()
    q, r = da.linalg.qr(x)
    test = da.all(da.isclose(x, q.dot(r)))
    test.compute()
    print(time.time() - t0)
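# A minimal, hedged example of driving the qr() helper above: a tall-skinny
# random dask array (shape and chunking are arbitrary here, but the columns
# must fit in a single chunk for dask's tall-and-skinny QR) is factorized and
# the reconstruction is verified, much like the TBB benchmark script earlier
# in this section.

import dask.array as da

x = da.random.random((20000, 500), chunks=(5000, 500))
qr(x)  # prints wall-clock time for the factorization + da.all(da.isclose(...)) check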