def test_reductions_2D_int():
    x = np.arange(1, 122).reshape((11, 11)).astype('i4')
    a = da.from_array(x, chunks=(4, 4))

    reduction_2d_test(da.sum, a, np.sum, x)
    reduction_2d_test(da.prod, a, np.prod, x)
    reduction_2d_test(da.mean, a, np.mean, x)
    reduction_2d_test(da.var, a, np.var, x, False)  # Difference in dtype algo
    reduction_2d_test(da.std, a, np.std, x, False)  # Difference in dtype algo
    reduction_2d_test(da.min, a, np.min, x, False)
    reduction_2d_test(da.max, a, np.max, x, False)
    reduction_2d_test(da.any, a, np.any, x, False)
    reduction_2d_test(da.all, a, np.all, x, False)

    reduction_2d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x)
    reduction_2d_test(da.nanmean, a, np.mean, x)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))

def test_reductions_1D(dtype):
    x = np.arange(5).astype(dtype)
    a = da.from_array(x, chunks=(2,))

    reduction_1d_test(da.sum, a, np.sum, x)
    reduction_1d_test(da.prod, a, np.prod, x)
    reduction_1d_test(da.mean, a, np.mean, x)
    reduction_1d_test(da.var, a, np.var, x)
    reduction_1d_test(da.std, a, np.std, x)
    reduction_1d_test(da.min, a, np.min, x, False)
    reduction_1d_test(da.max, a, np.max, x, False)
    reduction_1d_test(da.any, a, np.any, x, False)
    reduction_1d_test(da.all, a, np.all, x, False)

    reduction_1d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_1d_test(da.nanprod, a, np.nanprod, x)
    reduction_1d_test(da.nanmean, a, np.mean, x)
    reduction_1d_test(da.nanvar, a, np.var, x)
    reduction_1d_test(da.nanstd, a, np.std, x)
    reduction_1d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_1d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=0, split_every=2), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0, split_every=2), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0, split_every=2), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0, split_every=2), np.nanargmin(x, axis=0))

def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)
    reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
    with pytest.warns(None):  # division by 0 warning
        reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
    with pytest.warns(None):  # division by 0 warning
        reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
    with pytest.warns(None):  # all NaN axis warning
        reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
    with pytest.warns(None):  # all NaN axis warning
        reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

    assert_eq(da.argmax(a), np.argmax(x))
    assert_eq(da.argmin(a), np.argmin(x))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a), np.nanargmax(x))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a), np.nanargmin(x))
    assert_eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert_eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert_eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert_eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    with pytest.warns(None):  # all NaN axis warning
        assert_eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))

def test_reductions_1D_int():
    x = np.arange(5).astype('i4')
    a = da.from_array(x, chunks=(2,))

    reduction_1d_test(da.sum, a, np.sum, x)
    reduction_1d_test(da.prod, a, np.prod, x)
    reduction_1d_test(da.mean, a, np.mean, x)
    reduction_1d_test(da.var, a, np.var, x)
    reduction_1d_test(da.std, a, np.std, x)
    reduction_1d_test(da.min, a, np.min, x, False)
    reduction_1d_test(da.max, a, np.max, x, False)
    reduction_1d_test(da.any, a, np.any, x, False)
    reduction_1d_test(da.all, a, np.all, x, False)

    reduction_1d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_1d_test(da.nanprod, a, np.nanprod, x)
    reduction_1d_test(da.nanmean, a, np.mean, x)
    reduction_1d_test(da.nanvar, a, np.var, x)
    reduction_1d_test(da.nanstd, a, np.std, x)
    reduction_1d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_1d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))

def grid_search_model(data, model_meta, folds=5):
    """
    Perform Grid Search Cross Validation on the input model

    :param data: a pandas dataframe where each row is an hour
    :param model_meta: a dict containing the name for the model ("name"),
        the sklearn estimator ("model"), and the parameters for Grid Search
        Cross Validation ("params")
    :param folds: the number of splits for cross validation
    :return: a tuple containing the best R^2 score found, the parameters used
        to obtain that score, and the estimator retrained on the whole dataset
    """
    model = model_meta["model"]
    model_params = model_meta["params"]
    model_name = model_meta["name"]

    X = data.drop("cnt", axis=1)
    y = data["cnt"]

    tscv = TimeSeriesSplit(n_splits=folds)
    grid_search = GridSearchCV(estimator=model, param_grid=model_params,
                               scoring="r2", cv=tscv, refit=True)
    grid_search.fit(X, y)

    print("\tAverage result for best {}: {} +/- {:.5f}".format(
        model_name, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][
            da.argmax(grid_search.cv_results_["mean_test_score"])]))
    print("\tBest parameters for {0}: {1}".format(model_name,
                                                  grid_search.best_params_))

    # Need metrics to choose a model; the best estimator will have already
    # been retrained on the whole data set
    return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

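# Hedged usage sketch for grid_search_model above (not from the original
# source): `df` stands in for a prepared dataframe with a "cnt" target column,
# and the model_meta layout follows the docstring.
from sklearn.ensemble import RandomForestRegressor

meta = {
    "name": "random forest",
    "model": RandomForestRegressor(random_state=0),
    "params": {"n_estimators": [50, 100]},
}
# score, params, estimator = grid_search_model(df, meta, folds=5)
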
def max_and_argmax(data):
    """Returns max and argmax along last two axes.

    Last two axes should correspond to the x and y dimensions.

    Parameters
    ----------
    data : dask array
        data with at least 3 dimensions

    Returns
    -------
    weights : dask array
        max of `data` along the last two axes
    argmax : dask array
        argmax of `data` along the last two axes
    """
    # Flatten out the last two dimensions to nicely apply argmax and max
    flatData = data.reshape(data.shape[:-2] + (-1,))
    argmax = da.argmax(flatData, axis=-1)
    # We can forego calculating both max and argmax as soon as
    # we have da.take_along_axis(): https://github.com/dask/dask/issues/3663
    # Would a map_blocks of np.take_along_axis() work and be faster?
    weights = da.max(flatData, axis=-1)
    return weights, argmax

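# A minimal usage sketch for max_and_argmax (illustrative, with made-up data):
# the flat indices it returns can be unravelled back into (y, x) coordinates.
import numpy as np
import dask.array as da

frames = da.random.random((3, 32, 32), chunks=(1, 16, 16))
weights, argmax = max_and_argmax(frames)  # both lazy, each of shape (3,)
ys, xs = np.unravel_index(argmax.compute(), frames.shape[-2:])
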
def get_labels(self) -> zarr.array:
    if "labels" not in self.data:
        self.data["labels"] = Raw.from_dask_array(
            self.path / "labels.zarr.zip",
            da.argmax(da.array(self.get_doc_topic_matrix()), axis=1),
        )
        self.save()
    return self.data["labels"].get()

def predict(self, X):
    client = default_client()
    class_probs = predict(client, self._Booster, X)
    if class_probs.ndim > 1:
        cidx = da.argmax(class_probs, axis=1)
    else:
        cidx = (class_probs > 0).astype(np.int64)
    return cidx

def test_reductions_2D(dtype):
    x = np.arange(1, 122).reshape((11, 11)).astype(dtype)
    a = da.from_array(x, chunks=(4, 4))

    b = a.sum(keepdims=True)
    assert b._keys() == [[(b.name, 0, 0)]]

    reduction_2d_test(da.sum, a, np.sum, x)
    reduction_2d_test(da.prod, a, np.prod, x)
    reduction_2d_test(da.mean, a, np.mean, x)
    reduction_2d_test(da.var, a, np.var, x, False)  # Difference in dtype algo
    reduction_2d_test(da.std, a, np.std, x, False)  # Difference in dtype algo
    reduction_2d_test(da.min, a, np.min, x, False)
    reduction_2d_test(da.max, a, np.max, x, False)
    reduction_2d_test(da.any, a, np.any, x, False)
    reduction_2d_test(da.all, a, np.all, x, False)

    reduction_2d_test(da.nansum, a, np.nansum, x)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x)
    reduction_2d_test(da.nanmean, a, np.mean, x)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False)  # Difference in dtype algo
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False)

    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))
    assert eq(da.argmax(a, axis=0, split_every=2), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0, split_every=2), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0, split_every=2), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0, split_every=2), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1, split_every=2), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1, split_every=2), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1, split_every=2), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1, split_every=2), np.nanargmin(x, axis=1))

def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
        reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
        reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
        reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
        reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

    assert_eq(da.argmax(a), np.argmax(x))
    assert_eq(da.argmin(a), np.argmin(x))
    assert_eq(da.nanargmax(a), np.nanargmax(x))
    assert_eq(da.nanargmin(a), np.nanargmin(x))
    assert_eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert_eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert_eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert_eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert_eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert_eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert_eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert_eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))

def _get_first_unmasked_data(array, axis):
    """Get first unmasked value of an array along an axis."""
    mask = da.ma.getmaskarray(array)
    numerical_mask = da.where(mask, -1.0, 1.0)
    indices_first_positive = da.argmax(numerical_mask, axis=axis)
    indices = da.meshgrid(
        *[da.arange(array.shape[i]) for i in range(array.ndim) if i != axis],
        indexing='ij')
    indices.insert(axis, indices_first_positive)
    first_unmasked_data = np.array(array)[tuple(indices)]
    return first_unmasked_data

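# The core idiom above in isolation (a toy sketch, not from the original
# source): da.argmax over a mask-derived sign array yields the index of the
# first unmasked element, because argmax returns the first occurrence of the
# maximum value.
import numpy as np
import dask.array as da

arr = da.ma.masked_array(da.arange(5),
                         mask=np.array([True, True, False, False, True]))
sign = da.where(da.ma.getmaskarray(arr), -1.0, 1.0)
first_unmasked = da.argmax(sign, axis=0)  # computes to 2
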
def test_reductions_2D_nans():
    # chunks are a mix of some/all/no NaNs
    x = np.full((4, 4), np.nan)
    x[:2, :2] = np.array([[1, 2], [3, 4]])
    x[2, 2] = 5
    x[3, 3] = 6
    a = da.from_array(x, chunks=(2, 2))

    reduction_2d_test(da.sum, a, np.sum, x, False, False)
    reduction_2d_test(da.prod, a, np.prod, x, False, False)
    reduction_2d_test(da.mean, a, np.mean, x, False, False)
    reduction_2d_test(da.var, a, np.var, x, False, False)
    reduction_2d_test(da.std, a, np.std, x, False, False)
    reduction_2d_test(da.min, a, np.min, x, False, False)
    reduction_2d_test(da.max, a, np.max, x, False, False)
    reduction_2d_test(da.any, a, np.any, x, False, False)
    reduction_2d_test(da.all, a, np.all, x, False, False)

    reduction_2d_test(da.nansum, a, np.nansum, x, False, False)
    with ignoring(AttributeError):
        reduction_2d_test(da.nanprod, a, np.nanprod, x, False, False)
    reduction_2d_test(da.nanmean, a, np.nanmean, x, False, False)
    reduction_2d_test(da.nanvar, a, np.nanvar, x, False, False)
    reduction_2d_test(da.nanstd, a, np.nanstd, x, False, False)
    reduction_2d_test(da.nanmin, a, np.nanmin, x, False, False)
    reduction_2d_test(da.nanmax, a, np.nanmax, x, False, False)

    assert eq(da.argmax(a), np.argmax(x))
    assert eq(da.argmin(a), np.argmin(x))
    assert eq(da.nanargmax(a), np.nanargmax(x))
    assert eq(da.nanargmin(a), np.nanargmin(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.argmax(a, axis=1), np.argmax(x, axis=1))
    assert eq(da.argmin(a, axis=1), np.argmin(x, axis=1))
    assert eq(da.nanargmax(a, axis=1), np.nanargmax(x, axis=1))
    assert eq(da.nanargmin(a, axis=1), np.nanargmin(x, axis=1))

def max_and_argmax(data):
    """Return the dask max and argmax of data along the last two axes,
    which correspond to the x and y dimensions (uncomputed).
    """
    # Flatten out the last two dimensions to nicely apply argmax and max
    flatData = data.reshape(data.shape[:-2] + (-1,))
    argmax = da.argmax(flatData, axis=-1)
    # We can forego calculating both max and argmax as soon as
    # we have da.take_along_axis(): https://github.com/dask/dask/issues/3663
    # Would a map_blocks of np.take_along_axis() work and be faster?
    weights = da.max(flatData, axis=-1)
    return weights, argmax

def predict(self, X):
    """
    Perform classification on an array of test vectors X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    C : array, shape = [n_samples]
        Predicted target values for X
    """
    jll = self._joint_log_likelihood(X)
    return delayed(self.classes_)[da.argmax(jll, axis=1)]

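# The delayed-indexing pattern above, shown standalone with toy data (a
# sketch, not the original class): the label lookup stays lazy until computed.
import numpy as np
import dask.array as da
from dask import delayed

classes = np.array(["ham", "spam"])
jll = da.from_array(np.array([[0.1, 0.9], [0.8, 0.2]]), chunks=1)
labels = delayed(classes)[da.argmax(jll, axis=1)]
# labels.compute() -> array(['spam', 'ham'], dtype='<U4')
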
def nanargmax(a, axis=None):
    fill_value = dtypes.get_neg_infinity(a.dtype)
    if a.dtype.kind == "O":
        return _nan_argminmax_object("argmax", fill_value, a, axis=axis)
    a, mask = _replace_nan(a, fill_value)
    if isinstance(a, dask_array_type):
        res = dask_array.argmax(a, axis=axis)
    else:
        res = np.argmax(a, axis=axis)
    if mask is not None:
        mask = mask.all(axis=axis)
        if mask.any():
            raise ValueError("All-NaN slice encountered")
    return res

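# A sketch of the same fill-and-argmax approach without the module-internal
# helpers (_replace_nan, dtypes), using plain dask: fill NaNs with -inf, take
# argmax, and use the mask to flag all-NaN slices.
import numpy as np
import dask.array as da

x = da.from_array(np.array([[np.nan, 1.0], [np.nan, np.nan]]), chunks=1)
mask = da.isnan(x)
res = da.argmax(da.where(mask, -np.inf, x), axis=1)  # computes to [1, 0]
all_nan = mask.all(axis=1)  # [False, True]: row 1's result is meaningless
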
async def _predict_async(self, data, output_margin=False, base_margin=None):
    test_dmatrix = await DaskDMatrix(
        client=self.client, data=data, base_margin=base_margin,
        missing=self.missing
    )
    pred_probs = await predict(client=self.client,
                               model=self.get_booster(),
                               data=test_dmatrix,
                               output_margin=output_margin)
    if self.n_classes_ == 2:
        preds = (pred_probs > 0.5).astype(int)
    else:
        preds = da.argmax(pred_probs, axis=1)
    return preds

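# The multiclass branch above is the usual probabilities-to-labels idiom;
# in isolation, with toy data:
import numpy as np
import dask.array as da

pred_probs = da.from_array(np.array([[0.2, 0.5, 0.3],
                                     [0.7, 0.1, 0.2]]), chunks=(1, 3))
preds = da.argmax(pred_probs, axis=1)  # lazy; computes to [1, 0]
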
def test_linspace(endpoint):
    darr = da.linspace(6, 49, endpoint=endpoint, chunks=5)
    nparr = np.linspace(6, 49, endpoint=endpoint)
    assert_eq(darr, nparr)

    darr = da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13)
    nparr = np.linspace(1.4, 4.9, endpoint=endpoint, num=13)
    assert_eq(darr, nparr)

    darr = da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float)
    nparr = np.linspace(6, 49, endpoint=endpoint, dtype=float)
    assert_eq(darr, nparr)

    darr, dstep = da.linspace(6, 49, endpoint=endpoint, chunks=5, retstep=True)
    nparr, npstep = np.linspace(6, 49, endpoint=endpoint, retstep=True)
    assert np.allclose(dstep, npstep)
    assert_eq(darr, nparr)

    darr = da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13, dtype=int)
    nparr = np.linspace(1.4, 4.9, num=13, endpoint=endpoint, dtype=int)
    assert_eq(darr, nparr)

    assert sorted(
        da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13).dask
    ) == sorted(da.linspace(1.4, 4.9, endpoint=endpoint, chunks=5, num=13).dask)
    assert sorted(
        da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float).dask
    ) == sorted(da.linspace(6, 49, endpoint=endpoint, chunks=5, dtype=float).dask)

    x = da.array([0.2, 6.4, 3.0, 1.6])
    nparr = np.linspace(0, 2, 8, endpoint=endpoint)
    darr = da.linspace(da.argmin(x), da.argmax(x) + 1, 8, endpoint=endpoint)
    assert_eq(darr, nparr)

def select_best_model(models, data):
    """
    Given several models and data to fit, return the best model based on
    Grid Search Cross Validation

    :param models: an array of dicts containing the name for the model
        ("name"), the sklearn estimator ("model"), and the parameters for
        Grid Search Cross Validation ("params")
    :param data: a pandas dataframe where each row is an hour
    :return: an sklearn estimator that is the best model refit on the whole
        train data
    """
    results = [grid_search_model(data, model) for model in models]
    # Wrap the scores in a dask array for argmax, then materialize the index
    # so it can be used to index the plain Python list of results
    best_index = int(da.argmax(da.asarray([r[0] for r in results])))
    best_model = results[best_index][2]
    print("\nBest model: {0} with params {1}".format(model_name(best_model),
                                                     results[best_index][1]))
    return best_model

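# Hypothetical call for select_best_model, reusing the model_meta layout from
# grid_search_model above; `df` is an assumed training dataframe.
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

models = [
    {"name": "ridge", "model": Ridge(), "params": {"alpha": [0.1, 1.0]}},
    {"name": "random forest", "model": RandomForestRegressor(),
     "params": {"n_estimators": [50, 100]}},
]
# best = select_best_model(models, df)
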
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))

def first_cue_time(data: Dict[str, da.Array], message: int) -> Optional[da.Array]:
    """
    Find the timestamp of the first instance of a cue message in a Tristan
    data set.

    Args:
        data:     A LATRD data dictionary (a dictionary with data set names as
                  keys and Dask arrays as values).  Must contain one entry for
                  cue id messages and one for cue timestamps.  The two arrays
                  are assumed to have the same length.
        message:  The message code, as defined in the Tristan standard.

    Returns:
        The timestamp, measured in clock cycles from the global
        synchronisation signal.  If the message doesn't exist in the data
        set, this returns None.
    """
    index = da.argmax(data[cue_id_key] == message)
    if index or data[cue_id_key][0] == message:
        return data[cue_time_key][index]

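# first_cue_time relies on a common trick: da.argmax over a boolean array
# returns the position of the first True, or 0 when there is no match (hence
# the explicit check of element 0 above). A standalone sketch with toy data:
import numpy as np
import dask.array as da

cue_ids = da.from_array(np.array([7, 3, 9, 3]), chunks=2)
index = da.argmax(cue_ids == 3)               # first match, computes to 1
found = bool(index) or bool(cue_ids[0] == 3)  # distinguishes "no match at all"
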
def subcount_forecast(data, feature):
    """
    Creates a new column that is the predicted value of the input feature;
    essentially an abstraction for 'prediction_forecasts'

    :param data: a pandas dataframe where each row is an hour
    :param feature: a string containing the feature that should be forecasted
        (one of: casual, registered)
    :return: a pandas dataframe containing the new column
    """
    var_name = feature + "_forecast"
    print("\tAdding {} variable...".format(var_name))

    df = dd.get_dummies(data.copy().drop("cnt", axis=1))
    to_predict = dd.read_csv(PATH)[feature]
    df[feature] = to_predict

    train = get_train(df)
    model = RandomForestRegressor(random_state=SEED)
    model_params = {"n_estimators": list(range(10, 110, 10))}
    # tscv = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=model, param_grid=model_params,
                               scoring="r2", cv=None, refit=True)
    grid_search.fit(train.drop(feature, axis=1), train[feature])
    print("\t\tPredictions for GridSearchCV on {}: {:.5f} +/- {:.5f}".format(
        feature, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][da.argmax(
            grid_search.cv_results_["mean_test_score"])]))

    data[var_name] = grid_search.best_estimator_.predict(
        dd.get_dummies(data.drop("cnt", axis=1)))
    return data

def _stage_2(
    YP: Array,
    X: Array,
    Y: Array,
    alphas: Optional[NDArray] = None,
    normalize: bool = True,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Tuple[Array, Array]:
    """Stage 2 - WGR Meta Regression

    This stage will train separate ridge regression models for each outcome
    using the predictions from stage 1 for that same outcome as features.
    These predictions are then evaluated based on R2 score to determine an
    optimal "meta" estimator (see `_stage_1` for the "base" estimator
    description). Results then include only predictions and coefficients
    from this optimal model.

    For more details, see the level 1 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Assert single chunks for covariates and outcomes
    assert X.numblocks[1] == Y.numblocks[1] == 1
    # Extract shape statistics
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape
    n_covar = X.shape[1]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    sample_chunks = Y.chunks[0]

    if normalize:
        assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
        assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome)
        # See: https://github.com/projectglow/glow/issues/260
        if _glow_adj_scaling:
            YP = da.map_blocks(
                lambda x: (x - x.mean(axis=2, keepdims=True))
                / x.std(axis=2, keepdims=True),
                YP,
            )
        else:
            YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True)
    # Transpose for refit on level 1 predictions
    YP = YP.transpose((3, 2, 0, 1))
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    if alphas is None:
        # See: https://github.com/projectglow/glow/issues/255
        if _glow_adj_alpha:
            alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome)
        else:
            alphas = get_alphas(n_variant_block * n_alpha_1)
    n_alpha_2 = alphas.size

    YR = []
    BR = []
    for i in range(n_outcome):
        # Slice and reshape to new 2D covariate matrix;
        # the order of raveling in trailing dimensions is important
        # and later reshapes will assume variants, alphas order
        XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1))
        # Prepend covariates and chunk along first dim only
        XPB = da.concatenate((X, XPB), axis=1)
        XPB = XPB.rechunk(chunks=(None, -1))
        assert_array_shape(XPB, n_sample, n_indvar)
        assert XPB.numblocks == (n_sample_block, 1)
        # Extract outcome vector
        YB = Y[:, [i]]
        assert XPB.ndim == YB.ndim == 2
        # Fit and predict folds for each parameter
        BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:]
        assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1)
        assert_array_shape(YPB, n_alpha_2, n_sample, 1)
        BR.append(BB)
        YR.append(YPB)

    # Concatenate predictions along outcome dimension
    YR = da.concatenate(YR, axis=2)
    assert_block_shape(YR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1)
    assert_array_shape(YR, n_alpha_2, n_sample, n_outcome)
    # Move samples to last dim so all others are batch
    # dims for R2 calculations
    YR = da.transpose(YR, (0, 2, 1))
    assert_array_shape(YR, n_alpha_2, n_outcome, n_sample)
    YR = YR.rechunk((-1, -1, None))
    assert_block_shape(YR, 1, 1, n_sample_block)
    assert YR.shape[1:] == Y.T.shape

    # Concatenate betas along outcome dimension
    BR = da.concatenate(BR, axis=2)
    assert_block_shape(BR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(BR, n_alpha_2, n_indvar, 1)
    assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome)

    # Compute R2 scores within each sample block for each outcome + alpha
    R2 = da.stack(
        [
            r2_score(YR.blocks[..., i], Y.T.blocks[..., i])
            # Avoid warnings on R2 calculations for blocks with single rows
            if YR.chunks[-1][i] > 1
            else da.full(YR.shape[:-1], np.nan)
            for i in range(n_sample_block)
        ]
    )
    assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome)
    # Coerce to finite or nan before nan-aware mean
    R2 = da.where(da.isfinite(R2), R2, np.nan)
    # Find highest mean alpha score for each outcome across blocks
    R2M = da.nanmean(R2, axis=0)
    assert_array_shape(R2M, n_alpha_2, n_outcome)
    # Identify index for the alpha value with the highest mean score
    R2I = da.argmax(R2M, axis=0)
    assert_array_shape(R2I, n_outcome)

    # Choose the predictions corresponding to the model with best score
    YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
    YRM = YRM.rechunk((None, -1))
    assert_block_shape(YRM, n_sample_block, 1)
    assert_chunk_shape(YRM, sample_chunks[0], n_outcome)
    assert_array_shape(YRM, n_sample, n_outcome)
    # Choose the betas corresponding to the model with the best score
    BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1)
    BRM = BRM.rechunk((None, -1))
    assert_block_shape(BRM, n_sample_block, 1)
    assert_chunk_shape(BRM, n_indvar, n_outcome)
    assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome)
    return BRM, YRM

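# The final model-selection step above (argmax over mean scores, then a
# per-outcome gather) reduced to toy shapes, as a sketch:
import numpy as np
import dask.array as da

R2M = da.from_array(np.array([[0.1, 0.9],
                              [0.8, 0.2]]))  # (n_alpha, n_outcome)
R2I = da.argmax(R2M, axis=0)                 # best alpha per outcome -> [1, 0]
YR = da.random.random((2, 2, 5))             # (n_alpha, n_outcome, n_sample)
YRM = da.stack([YR[R2I[i], i, :] for i in range(2)], axis=-1)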