def rmse(self, gt: DNDarray, yest: DNDarray) -> float:
    """
    Root mean square error (RMSE)

    Parameters
    ----------
    gt : DNDarray
        Input model data, Shape = (1,)
    yest : DNDarray
        Thresholded model data, Shape = (1,)

    Returns
    -------
    float
        Square root of the mean of the squared element-wise differences between ``gt`` and
        ``yest``, unwrapped into a Python scalar through ``.larray.item()`` (hence not a
        DNDarray).
    """
    return ht.sqrt((ht.mean((gt - yest) ** 2))).larray.item()
def rmse(self, gt, yest):
    """
    Root mean square error (RMSE)

    Parameters
    ----------
    gt : HeAT tensor, shape (1,)
        Input model data
    yest : HeAT tensor, shape (1,)
        Thresholded model data

    Returns
    -------
    Python scalar
        ``sqrt(mean((gt - yest) ** 2))`` extracted with ``.larray.item()``.
    """
    residual = gt - yest
    mean_squared_error = ht.mean(residual ** 2)
    root = ht.sqrt(mean_squared_error)
    return root.larray.item()
def test_lasso(self):
    """
    Smoke test for ``ht.regression.lasso.Lasso`` on the diabetes data set.

    Checks the estimator attributes before and after ``fit``, the type and shape of the
    ``predict`` output, and that ``fit`` raises ``ValueError`` for the 3-D inputs used below.
    """
    # ToDo: add additional tests
    # load the test data (both data sets are loaded with split=0)
    data_file = os.path.join(os.getcwd(), "heat/datasets/data/diabetes.h5")
    features = ht.load_hdf5(data_file, dataset="x", device=ht_device, split=0)
    targets = ht.load_hdf5(data_file, dataset="y", device=ht_device, split=0)

    # scale every column by its root mean square
    features = features / ht.sqrt((ht.mean(features ** 2, axis=0)))
    n_rows, n_cols = features.shape

    lasso = ht.regression.lasso.Lasso(max_iter=100, tol=None)

    # state of the freshly constructed estimator
    self.assertEqual(lasso.lam, 0.1)
    self.assertIsNone(lasso.theta)
    self.assertIsNone(lasso.n_iter)
    self.assertEqual(lasso.max_iter, 100)
    self.assertEqual(lasso.coef_, None)
    self.assertEqual(lasso.intercept_, None)

    lasso.fit(features, targets)

    # state after fitting
    self.assertEqual(lasso.lam, 0.1)
    self.assertIsInstance(lasso.theta, ht.DNDarray)
    self.assertEqual(lasso.n_iter, 100)
    self.assertEqual(lasso.max_iter, 100)
    self.assertEqual(lasso.coef_.shape, (n_cols - 1, 1))
    self.assertEqual(lasso.intercept_.shape, (1,))

    # prediction on the training data
    prediction = lasso.predict(features)
    self.assertIsInstance(prediction, ht.DNDarray)
    self.assertEqual(prediction.shape, (n_rows, 1))

    # inputs with an unsupported number of dimensions must be rejected
    with self.assertRaises(ValueError):
        lasso.fit(features, ht.zeros((3, 3, 3)))
    with self.assertRaises(ValueError):
        lasso.fit(ht.zeros((3, 3, 3)), ht.zeros((3, 3)))
def test_lasso(self):
    """
    Smoke test for the HeAT, PyTorch and NumPy lasso estimators on the diabetes data set.

    For each backend the data is (re)loaded, converted to the backend's array type,
    normalized column-wise, and then passed through the same sequence of checks:
    attributes of the fresh estimator, attributes after ``fit``, and type/shape of the
    ``predict`` output.
    """
    # ToDo: add additional tests
    data_file = os.path.join(os.getcwd(), "heat/datasets/data/diabetes.h5")

    def check_estimator(estimator, X, y, array_type):
        """Run the shared pre-fit, post-fit and predict assertions for one backend."""
        m, n = X.shape

        # freshly constructed estimator: defaults set, nothing learned yet
        self.assertEqual(estimator.lam, 0.1)
        self.assertIsNone(estimator.theta)
        self.assertIsNone(estimator.n_iter)
        self.assertEqual(estimator.max_iter, 100)
        self.assertIsNone(estimator.coef_)
        self.assertIsNone(estimator.intercept_)

        estimator.fit(X, y)

        # fitted estimator: the test expects n_iter to equal max_iter
        self.assertEqual(estimator.lam, 0.1)
        self.assertIsInstance(estimator.theta, array_type)
        self.assertEqual(estimator.n_iter, 100)
        self.assertEqual(estimator.max_iter, 100)
        self.assertEqual(estimator.coef_.shape, (n - 1, 1))
        self.assertEqual(estimator.intercept_.shape, (1,))

        # prediction on the training data
        yest = estimator.predict(X)
        self.assertIsInstance(yest, array_type)
        self.assertEqual(yest.shape, (m,))

    # --- HeAT ---
    X = ht.load_hdf5(data_file, dataset="x")
    y = ht.load_hdf5(data_file, dataset="y")
    # normalize dataset
    X = X / ht.sqrt((ht.mean(X ** 2, axis=0)))
    check_estimator(
        ht.core.regression.lasso.HeatLasso(max_iter=100, tol=None), X, y, ht.DNDarray
    )

    # --- PyTorch: same checks on the underlying torch tensors ---
    X = ht.load_hdf5(data_file, dataset="x")
    y = ht.load_hdf5(data_file, dataset="y")
    X = torch.tensor(X._DNDarray__array)
    y = torch.tensor(y._DNDarray__array)
    # normalize dataset
    X = X / torch.sqrt((torch.mean(X ** 2, 0)))
    check_estimator(
        ht.core.regression.lasso.PytorchLasso(max_iter=100, tol=None), X, y, torch.Tensor
    )

    # --- NumPy: same checks on numpy copies of the data ---
    X = ht.load_hdf5(data_file, dataset="x")
    y = ht.load_hdf5(data_file, dataset="y")
    X = X._DNDarray__array.numpy()
    y = y._DNDarray__array.numpy()
    # normalize dataset
    X = X / np.sqrt((np.mean(X ** 2, axis=0, keepdims=True)))
    check_estimator(
        ht.core.regression.lasso.NumpyLasso(max_iter=100, tol=None), X, y, np.ndarray
    )
import heat as ht
import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets
import heat.ml.regression.lasso as lasso
import plotfkt

# read scikit diabetes data set
diabetes = datasets.load_diabetes()

# load diabetes dataset from hdf5 file
X = ht.load_hdf5("../../heat/datasets/data/diabetes.h5", dataset="x", split=0)
y = ht.load_hdf5("../../heat/datasets/data/diabetes.h5", dataset="y", split=0)

# normalize dataset
# TODO: this goes into the lasso fit routine as soon as issue #106 is solved
X = X / ht.sqrt((ht.mean(X ** 2, axis=0)))

# HeAT lasso instance
estimator = lasso.HeatLasso(max_iter=100)

# List lasso model parameters
theta_list = []

# Range of lambda values (np.logspace needs the numpy import added above)
lamda = np.logspace(0, 4, 10) / 10

# compute the lasso path: refit for every regularization strength and keep the
# flattened parameter vector of each fit
for l in lamda:
    estimator.lam = l
    estimator.fit(X, y)
    theta_list.append(estimator.theta.numpy().flatten())
def __update_mean_variance(n_past, mu, var, X, sample_weight=None):
    """
    Adapted to HeAT from scikit-learn.

    Online update of Gaussian mean and variance: merges the statistics of an already
    seen set of samples (count ``n_past``, mean ``mu``, variance ``var``) with those of a
    new batch ``X``, optionally weighted. Every column of ``X`` is handled independently,
    so the result is a per-column variance, not a covariance. Scalar as well as vector
    means/variances are accepted. See Chan, Golub, and LeVeque 1983 [1].

    Parameters
    ----------
    n_past : int
        Number of samples represented in the old mean and variance. If sample weights
        were given, this should be the sum of the weights represented in them.
    mu : ht.tensor of shape (number of Gaussians,)
        Means for Gaussians in the original set.
    var : ht.tensor of shape (number of Gaussians,)
        Variances for Gaussians in the original set.
    X : ht.tensor
        New data points; reductions run along axis 0 and ``X.shape[0]`` is used as the
        number of new samples (presumably shape (n_samples, number of Gaussians)).
    sample_weight : ht.tensor of shape (n_samples,), optional (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    total_mu : ht.tensor of shape (number of Gaussians,)
        Updated mean for each Gaussian over the combined set.
    total_var : ht.tensor of shape (number of Gaussians,)
        Updated variance for each Gaussian over the combined set.

    References
    ----------
    [1] Chan, Tony F., Golub, Gene H., and Leveque, Randall J., "Algorithms for Computing
    the Sample Variance: Analysis and Recommendations", The American Statistician, 37:3,
    pp. 242-247, 1983
    """
    # an empty batch leaves the running statistics untouched
    if X.shape[0] == 0:
        return mu, var

    # statistics of the new batch
    # TODO:Issue #351 allow weighted average across multiple axes
    if sample_weight is None:
        n_new = X.shape[0]
        new_mu = ht.mean(X, axis=0)
        # NOTE(review): the merge below treats n_new * new_var as a sum of squared
        # differences, which presumes ht.var defaults to the population variance
        # (ddof=0) - confirm against the HeAT version in use.
        new_var = ht.var(X, axis=0)
    else:
        n_new = float(sample_weight.sum())
        new_mu = ht.average(X, axis=0, weights=sample_weight)
        new_var = ht.average((X - new_mu) ** 2, axis=0, weights=sample_weight)

    # nothing seen before: the batch statistics are the result
    if n_past == 0:
        return new_mu, new_var

    n_total = float(n_past + n_new)

    # count-weighted combination of the two means
    total_mu = (n_new * new_mu + n_past * mu) / n_total

    # combine the sums of squared differences of both sets and add the term that
    # accounts for the shift between the two means
    mean_shift_term = (n_new * n_past / n_total) * (mu - new_mu) ** 2
    total_var = (n_past * var + n_new * new_var + mean_shift_term) / n_total

    return total_mu, total_var
def test_var(self):
    """
    Check ``var`` on HeAT arrays.

    Covers rejected ``axis`` / ``ddof`` arguments, one small closed-form value, reductions
    of an all-ones array (whole array, every single axis, every pair of axes) for each
    split setting, and the variance of the iris data set.
    """
    # every axis has length 2 * ht.MPI_WORLD.size
    side = ht.MPI_WORLD.size * 2
    dimensions = [side, side, side]

    # test raises
    x = ht.zeros((2, 3, 4))
    with self.assertRaises(ValueError):
        x.var(axis=10)
    with self.assertRaises(ValueError):
        x.var(axis=[4])
    with self.assertRaises(ValueError):
        x.var(axis=[-4])
    with self.assertRaises(TypeError):
        ht.var(x, axis="01")
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, "10"))
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, 0))
    with self.assertRaises(NotImplementedError):
        ht.var(x, ddof=2)
    with self.assertRaises(ValueError):
        ht.var(x, ddof=-2)
    # NOTE(review): this exercises ht.mean inside the var test - looks like a
    # copy-paste from test_mean; confirm whether ht.var was intended.
    with self.assertRaises(ValueError):
        ht.mean(x, axis=torch.Tensor([0, 0]))

    a = ht.arange(1, 5)
    self.assertEqual(a.var(ddof=1), 1.666666666666666)

    # ones: every variance of a constant array has to be zero
    for split in list(range(len(dimensions))) + [None]:
        z = ht.ones(dimensions, split=split)
        total_dims_list = list(z.shape)
        res = z.var(ddof=0)
        self.assertTrue((res == 0).all())

        # reduce over each single axis
        for it in range(len(z.shape)):
            res = z.var(axis=it)
            self.assertTrue(ht.allclose(res, 0))

            # the reduced axis disappears from the global shape
            expected_shape = tuple(d for q, d in enumerate(total_dims_list) if q != it)
            self.assertEqual(res.gshape, expected_shape)

            # expected split: gone if the split axis was reduced (or there was none),
            # unchanged if the reduced axis lies behind it, shifted by one otherwise
            if z.split is None or it == split:
                sp = None
            elif it > z.split:
                sp = z.split
            else:
                sp = z.split - 1
            self.assertEqual(res.split, sp)

            if split == it:
                res = z.var(axis=it)
                self.assertTrue(ht.allclose(res, 0))

        # reduce over each pair of axes
        for pair in combinations(range(len(z.shape)), 2):
            lp_split = list(pair)
            res = z.var(axis=lp_split)
            self.assertTrue((res == 0).all())

            target_dims = [d for q, d in enumerate(total_dims_list) if q not in lp_split]
            if not target_dims:
                target_dims = (1,)
            if res.gshape:
                self.assertEqual(res.gshape, tuple(target_dims))
            if res.split is not None:
                if any(split >= axis for axis in lp_split):
                    expected_split = len(target_dims) - 1
                else:
                    expected_split = z.split
                self.assertEqual(res.split, expected_split)

    # values for the iris dataset var measured by libreoffice calc
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
        iris_var = ht.var(iris, bessel=True)
        self.assertTrue(ht.allclose(iris_var, 3.90318519755147))
def test_mean(self):
    """
    Check ``mean`` on HeAT arrays.

    Covers rejected ``axis`` arguments, one small closed-form value, reductions of an
    all-ones array (whole array, every single axis, every pair of axes) for each split
    setting, and the overall / per-column means of the iris data set.
    """
    dimensions = [5, 5, 5]

    # invalid axis arguments
    x = ht.zeros((2, 3, 4))
    with self.assertRaises(ValueError):
        x.mean(axis=10)
    with self.assertRaises(ValueError):
        x.mean(axis=[4])
    with self.assertRaises(ValueError):
        x.mean(axis=[-4])
    with self.assertRaises(TypeError):
        ht.mean(x, axis="01")
    with self.assertRaises(ValueError):
        ht.mean(x, axis=(0, "10"))
    with self.assertRaises(ValueError):
        ht.mean(x, axis=(0, 0))
    with self.assertRaises(ValueError):
        ht.mean(x, axis=torch.Tensor([0, 0]))

    a = ht.arange(1, 5)
    self.assertEqual(a.mean(), 2.5)

    # ones: every mean of an all-ones array has to be one
    for split in list(range(len(dimensions))) + [None]:
        z = ht.ones(dimensions, split=split)
        total_dims_list = list(z.shape)
        res = z.mean()
        self.assertTrue((res == 1).all())

        # reduce over each single axis
        for it in range(len(z.shape)):
            res = z.mean(axis=it)
            self.assertTrue((res == 1).all())

            # the reduced axis disappears from the global shape
            expected_shape = tuple(d for q, d in enumerate(total_dims_list) if q != it)
            self.assertEqual(res.gshape, expected_shape)

            # expected split: gone if the split axis was reduced (or there was none),
            # unchanged if the reduced axis lies behind it, shifted by one otherwise
            if z.split is None or it == split:
                sp = None
            elif it > z.split:
                sp = z.split
            else:
                sp = z.split - 1
            self.assertEqual(res.split, sp)

        # reduce over each pair of axes
        for pair in combinations(range(len(z.shape)), 2):
            lp_split = list(pair)
            res = z.mean(axis=lp_split)
            self.assertTrue((res == 1).all())

            target_dims = [d for q, d in enumerate(total_dims_list) if q not in lp_split]
            if not target_dims:
                target_dims = (1,)
            if res.gshape:
                self.assertEqual(res.gshape, tuple(target_dims))
            if res.split is not None:
                if any(split >= axis for axis in lp_split):
                    expected_split = len(target_dims) - 1
                else:
                    expected_split = z.split
                self.assertEqual(res.split, expected_split)

    # values for the iris dataset mean measured by libreoffice calc
    ax0 = ht.array([5.84333333333333, 3.054, 3.75866666666667, 1.19866666666667])
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
        self.assertTrue(ht.allclose(ht.mean(iris), 3.46366666666667))
        self.assertTrue(ht.allclose(ht.mean(iris, axis=0), ax0))