def test_set_default_device_cpu(self):
    """Selecting the CPU by name, by device object or via ``None`` leaves ``ht.cpu`` active."""
    for selector in ("cpu", ht.cpu, None):
        ht.use_device(selector)
        self.assertIs(ht.get_device(), ht.cpu)

    # an unknown device name and a non-device object must both be rejected
    for invalid in ("fpu", 1):
        with self.assertRaises(ValueError):
            ht.use_device(invalid)
def test_set_default_device_gpu(self):
    """Selecting the GPU by name, by device object or via ``None`` leaves ``ht.gpu`` active.

    The GPU selections are only exercised when CUDA is available.
    """
    if ht.torch.cuda.is_available():
        for selector in ("gpu", ht.gpu, None):
            ht.use_device(selector)
            self.assertIs(ht.get_device(), ht.gpu)

    # an unknown device name and a non-device object must both be rejected
    for invalid in ("fpu", 1):
        with self.assertRaises(ValueError):
            ht.use_device(invalid)
def test_asarray(self):
    """Check ``ht.asarray`` for DNDarray, distributed list, NumPy and torch inputs."""
    # same heat array: the input object itself must be handed back
    arr = ht.array([1, 2])
    self.assertIs(ht.asarray(arr), arr)

    # from distributed python list
    arr = ht.array([1, 2, 3, 4, 5, 6], split=0)
    lst = arr.tolist(keepsplit=True)
    asarr = ht.asarray(lst, is_split=0)
    self.assertEqual(asarr.shape, arr.shape)
    self.assertEqual(asarr.split, 0)
    self.assertEqual(asarr.device, ht.get_device())
    self.assertTrue(ht.equal(asarr, arr))

    # from numpy array
    arr = np.array([1, 2, 3, 4])
    asarr = ht.asarray(arr)
    # np.alltrue no longer exists in NumPy 2.0; array_equal also checks the shapes
    self.assertTrue(np.array_equal(asarr.numpy(), arr))

    # write through the DNDarray; on the CPU the assertion only holds if the
    # source NumPy array observes that write
    asarr[0] = 0
    if asarr.device == ht.cpu:
        self.assertEqual(asarr.numpy()[0], arr[0])

    # from torch tensor
    arr = torch.tensor([1, 2, 3, 4], device=self.device.torch_device)
    asarr = ht.asarray(arr)
    self.assertTrue(torch.equal(asarr.larray, arr))

    # same write-through check against the source torch tensor
    asarr[0] = 0
    self.assertEqual(asarr.larray[0].item(), arr[0].item())
def train(model, device, optimizer, target, batches=20, scaler=None):
    """Run ``batches`` optimisation steps on seeded random inputs and return the last loss.

    Parameters
    ----------
    model : callable module
        Put into training mode and called on every input batch.
    device : torch device or str
        Device each input/target batch is moved to before the forward pass.
    optimizer : optimizer object
        Receives ``last_batch = batches - 1`` and has ``zero_grad``/``step`` called per batch.
    target : indexable of tensors
        ``target[b]`` is the regression target of batch ``b``.
    batches : int
        Number of batches to generate and train on.
    scaler : gradient scaler, optional
        When given, the forward pass runs under CUDA autocast and the loss is scaled
        before the backward pass.

    Returns
    -------
    Detached copy of the loss of the final batch.
    """
    model.train()
    optimizer.last_batch = batches - 1
    criterion = torch.nn.MSELoss()
    # fixed seed so that every call trains on the same inputs
    torch.random.manual_seed(10)
    inputs = torch.rand(batches, 2, 1, 32, 32, device=ht.get_device().torch_device)
    for step in range(batches):
        sample, expected = inputs[step].to(device), target[step].to(device)
        optimizer.zero_grad()
        if scaler is None:
            prediction = model(sample)
            loss = criterion(prediction, expected)
            last_loss = loss.clone().detach()
            loss.backward()
        else:
            with torch.cuda.amp.autocast():
                prediction = model(sample)
                loss = criterion(prediction, expected)
                last_loss = loss.clone().detach()
            scaler.scale(loss).backward()
        optimizer.step()
    return last_loss
def test_set_default_device(self):
    """Exercise ``ht.use_device`` for the device named by the ``DEVICE`` environment variable."""
    if os.environ.get("DEVICE") == "gpu":
        name, expected = "gpu", ht.gpu
    else:
        name, expected = "cpu", ht.cpu

    # by name, by device object, and via None: the expected device stays active
    for selector in (name, expected, None):
        ht.use_device(selector)
        self.assertIs(ht.get_device(), expected)

    # an unknown device name and a non-device object must both be rejected
    for invalid in ("fpu", 1):
        with self.assertRaises(ValueError):
            ht.use_device(invalid)
import torch
import unittest
import os
import heat as ht

# Import-time device selection driven by the DEVICE environment variable.
# DEVICE == "gpu" with CUDA available: heat defaults to the GPU and torch's
# current CUDA device is set to match. Anything else: heat defaults to the CPU.
if os.environ.get("DEVICE") == "gpu" and torch.cuda.is_available():
    ht.use_device("gpu")
    torch.cuda.set_device(torch.device(ht.get_device().torch_device))
else:
    ht.use_device("cpu")
# torch device of whatever heat default was selected above
device = ht.get_device().torch_device
# explicit heat device handed to array constructors in the tests; None unless "lgpu"
# (presumably None makes heat fall back to its default device - TODO confirm)
ht_device = None
# DEVICE == "lgpu": the heat default stays on the CPU (the else-branch above ran),
# while `device` / `ht_device` are pointed at the GPU.
if os.environ.get("DEVICE") == "lgpu" and torch.cuda.is_available():
    device = ht.gpu.torch_device
    ht_device = ht.gpu
    torch.cuda.set_device(device)


class TestLogical(unittest.TestCase):
    def test_all(self):
        # all() without an axis over a 1-d array of ones
        array_len = 9

        # check all over all float elements of 1d tensor locally
        ones_noaxis = ht.ones(array_len, device=ht_device)
        x = (ones_noaxis == 1).all()
        self.assertIsInstance(x, ht.DNDarray)
        # both the global and the local shape of the reduction result are (1,)
        self.assertEqual(x.shape, (1, ))
        self.assertEqual(x.lshape, (1, ))
        self.assertEqual(x.dtype, ht.bool)
        # the name-mangled private attribute is compared against a torch dtype
        self.assertEqual(x._DNDarray__array.dtype, torch.bool)
def test_data_parallel(self):
    """Train a small CNN through ``ht.nn.DataParallel`` and compare parameters across ranks.

    Three configurations are run for two epochs each: blocking parameter updates with a
    custom shuffling dataset, then non-blocking updates with ``ht.utils.data.Dataset``
    without and with ``ishuffle``. After every epoch each parameter is wrapped as a
    split-0 DNDarray, resplit, and its per-process chunks are compared. Invalid
    constructor arguments for the loader, optimizer wrapper and model wrapper are
    checked along the way.
    """
    import heat.nn.functional as F

    with self.assertRaises(TypeError):
        ht.utils.data.datatools.DataLoader("asdf")

    class Model(ht.nn.Module):
        def __init__(self):
            super().__init__()
            # 1 input image channel, 6 output channels, 3x3 square convolution kernel
            self.conv1 = ht.nn.Conv2d(1, 6, 3)
            self.conv2 = ht.nn.Conv2d(6, 16, 3)
            # an affine operation: y = Wx + b
            self.fc1 = ht.nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
            self.fc2 = ht.nn.Linear(120, 84)
            self.fc3 = ht.nn.Linear(84, 10)

        def forward(self, x):
            # Max pooling over a (2, 2) window
            x = self.conv1(x)
            x = F.max_pool2d(F.relu(x), (2, 2))
            # If the size is a square you can only specify a single number
            x = F.max_pool2d(F.relu(self.conv2(x)), 2)
            x = x.view(-1, self.num_flat_features(x))
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

        def num_flat_features(self, x):
            size = x.size()[1:]  # all dimensions except the batch dimension
            num_features = 1
            for s in size:
                num_features *= s
            return num_features

    class TestDataset(ht.utils.data.Dataset):
        def __init__(self, array, ishuffle):
            super().__init__(array, ishuffle=ishuffle)

        def __getitem__(self, item):
            return self.data[item]

        def Ishuffle(self):
            if not self.test_set:
                ht.utils.data.dataset_ishuffle(self, attrs=[["data", None]])

        def Shuffle(self):
            if not self.test_set:
                ht.utils.data.dataset_shuffle(self, attrs=[["data", None]])

    lim = 1e-4
    loss_fn = torch.nn.MSELoss()

    def make_parallel_model(model, comm, dp_optimizer, blocking):
        # wrap the model and move it to the GPU when that is the active heat device
        ht_model = ht.nn.DataParallel(
            model, comm, dp_optimizer, blocking_parameter_updates=blocking
        )
        if str(ht.get_device())[:3] == "gpu":
            ht_model.to(ht.get_device().torch_device)
        return ht_model

    def train_and_compare(ht_model, dp_optimizer, dataloader, labels):
        # two epochs of training; after each one, compare the parameter copies of all ranks
        for _ in range(2):
            for batch in dataloader:
                self.assertEqual(batch.shape[0], 2)
                dp_optimizer.zero_grad()
                ht_outputs = ht_model(batch)
                loss_fn(ht_outputs, labels).backward()
                dp_optimizer.step()
            for p in ht_model.parameters():
                p0dim = p.shape[0]
                hld = ht.resplit(ht.array(p, is_split=0))._DNDarray__array
                # one chunk of p0dim rows per process, including the last rank
                chunks = [
                    hld[i * p0dim : (i + 1) * p0dim] for i in range(ht.MPI_WORLD.size)
                ]
                for other in chunks[1:]:
                    self.assertTrue(torch.allclose(chunks[0], other, rtol=lim, atol=lim))

    # --- blocking parameter updates, custom dataset with ishuffle ---
    model = Model()
    optimizer = ht.optim.SGD(model.parameters(), lr=0.001)
    with self.assertRaises(TypeError):
        ht.optim.DataParallelOptimizer(optimizer, "asdf")
    dp_optimizer = ht.optim.DataParallelOptimizer(optimizer, True)

    ht.random.seed(1)
    torch.random.manual_seed(1)

    labels = torch.randn((2, 10), device=ht.get_device().torch_device)
    data = ht.random.rand(2 * ht.MPI_WORLD.size, 1, 32, 32, split=0)
    dataset = TestDataset(data, ishuffle=True)
    dataloader = ht.utils.data.datatools.DataLoader(dataset=dataset, batch_size=2)
    # there is only 1 batch on each process (data size[0] is 2 * number of processes,
    # and the batch size is 2)
    self.assertEqual(len(dataloader), 1)
    ht_model = make_parallel_model(model, data.comm, dp_optimizer, True)
    train_and_compare(ht_model, dp_optimizer, dataloader, labels)

    # --- non-blocking parameter updates, stock dataset without ishuffle ---
    model = Model()
    optimizer = ht.optim.SGD(model.parameters(), lr=0.001)
    dp_optimizer = ht.optim.DataParallelOptimizer(optimizer, False)
    labels = torch.randn((2, 10), device=ht.get_device().torch_device)
    data = ht.random.rand(2 * ht.MPI_WORLD.size, 1, 32, 32, split=0)
    dataset = ht.utils.data.Dataset(data, ishuffle=False)
    dataloader = ht.utils.data.datatools.DataLoader(dataset=dataset, batch_size=2)
    ht_model = make_parallel_model(model, data.comm, dp_optimizer, False)
    with self.assertRaises(TypeError):
        ht.nn.DataParallel(model, data.comm, "asdf")
    train_and_compare(ht_model, dp_optimizer, dataloader, labels)

    # --- non-blocking parameter updates, stock dataset with ishuffle ---
    model = Model()
    optimizer = ht.optim.SGD(model.parameters(), lr=0.001)
    dp_optimizer = ht.optim.DataParallelOptimizer(optimizer, False)
    labels = torch.randn((2, 10), device=ht.get_device().torch_device)
    data = ht.random.rand(2 * ht.MPI_WORLD.size, 1, 32, 32, split=0)
    dataset = ht.utils.data.Dataset(data, ishuffle=True)
    dataloader = ht.utils.data.datatools.DataLoader(dataset=dataset, batch_size=2)
    ht_model = make_parallel_model(model, data.comm, dp_optimizer, False)
    train_and_compare(ht_model, dp_optimizer, dataloader, labels)

    # a list of optimizers triggers a warning and the wrapper ends up with blocking updates
    with self.assertWarns(Warning):
        ht_model = ht.nn.DataParallel(
            model, ht.MPI_WORLD, [dp_optimizer, dp_optimizer], blocking_parameter_updates=False
        )
    # NOTE: this will throw a warning: this is expected
    self.assertTrue(ht_model.blocking_parameter_updates)
def test_get_default_device_gpu(self):
    """When CUDA is available, the active heat device is expected to be ``ht.gpu``."""
    if not ht.torch.cuda.is_available():
        # nothing to check without CUDA
        return
    self.assertIs(ht.get_device(), ht.gpu)
def test_get_default_device_cpu(self):
    """The active heat device is expected to be ``ht.cpu``."""
    active = ht.get_device()
    self.assertIs(active, ht.cpu)
def test_get_default_device(self):
    """Check the active device against the ``DEVICE`` environment variable.

    With ``DEVICE == "gpu"`` the GPU is selected first and must then be active;
    otherwise the CPU must already be the active device.
    """
    requested = os.environ.get("DEVICE")
    if requested != "gpu":
        self.assertIs(ht.get_device(), ht.cpu)
        return
    ht.use_device(requested)
    self.assertIs(ht.get_device(), ht.gpu)
def test_daso(self):
    """Validate ``ht.optim.DASO`` argument checking and, on 8 ranks with GPUs, its training.

    The argument checks run only for a single process with ``HEAT_TEST_USE_DEVICE``
    unset or ``"cpu"``. The training section returns early unless the MPI world has
    exactly 8 processes and at least one CUDA device is visible.
    """
    import heat.nn.functional as F
    import heat.optim as optim

    class Model(ht.nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.conv1 = ht.nn.Conv2d(1, 6, 3)
            self.conv2 = ht.nn.Conv2d(6, 16, 3)
            self.fc1 = ht.nn.Linear(16 * 6 * 6, 120)
            self.fc2 = ht.nn.Linear(120, 84)
            self.fc3 = ht.nn.Linear(84, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = F.max_pool2d(F.relu(x), (2, 2))
            x = F.max_pool2d(F.relu(self.conv2(x)), 2)
            x = x.view(-1, self.num_flat_features(x))
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

        @staticmethod
        def num_flat_features(x):
            # product of all dimensions except the batch dimension
            num_features = 1
            for s in x.size()[1:]:
                num_features *= s
            return num_features

    class TestDataset(ht.utils.data.Dataset):
        def __init__(self, array, ishuffle):
            super(TestDataset, self).__init__(array, ishuffle=ishuffle)

        def __getitem__(self, item):
            return self.data[item]

        def Ishuffle(self):
            if not self.test_set:
                ht.utils.data.dataset_ishuffle(self, attrs=[["data", None]])

        def Shuffle(self):
            if not self.test_set:
                ht.utils.data.dataset_shuffle(self, attrs=[["data", None]])

    def train(model, device, optimizer, target, batches=20, scaler=None):
        # run `batches` steps on seeded random inputs; return the detached last loss
        model.train()
        optimizer.last_batch = batches - 1
        criterion = torch.nn.MSELoss()
        torch.random.manual_seed(10)
        inputs = torch.rand(batches, 2, 1, 32, 32, device=ht.get_device().torch_device)
        for step in range(batches):
            sample, expected = inputs[step].to(device), target[step].to(device)
            optimizer.zero_grad()
            if scaler is None:
                prediction = model(sample)
                loss = criterion(prediction, expected)
                last_loss = loss.clone().detach()
                loss.backward()
            else:
                with torch.cuda.amp.autocast():
                    prediction = model(sample)
                    loss = criterion(prediction, expected)
                    last_loss = loss.clone().detach()
                scaler.scale(loss).backward()
            optimizer.step()
        return last_loss

    model = Model()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    envar = os.getenv("HEAT_TEST_USE_DEVICE", "cpu")
    if ht.MPI_WORLD.size == 1 and envar == "cpu":
        # every case overrides exactly one argument of an otherwise valid constructor call
        valid_kwargs = {"local_optimizer": optimizer, "total_epochs": 1}
        wrong_type_overrides = (
            {"local_optimizer": "asdf"},
            {"total_epochs": "aa"},
            {"warmup_epochs": "asdf"},
            {"cooldown_epochs": "asdf"},
            {"scheduler": "asdf"},
            {"stability_level": "asdf"},
            {"max_global_skips": "asdf"},
            {"sending_chunk_size": "asdf"},
            {"verbose": "asdf"},
            {"use_mpi_groups": "asdf"},
            {"downcast_type": "asdf"},
            {"comm": "asdf"},
            {"local_skip_factor": "asdf"},
            {"skip_reduction_factor": "asdf"},
        )
        wrong_value_overrides = (
            {"downcast_type": torch.bool},
            {"warmup_epochs": -1},
            {"cooldown_epochs": -1},
            {"max_global_skips": -1},
            {"sending_chunk_size": -1},
            {"total_epochs": -1},
            {"local_skip_factor": -1},
            {"skip_reduction_factor": -1},
        )
        for override in wrong_type_overrides:
            with self.assertRaises(TypeError):
                ht.optim.DASO(**{**valid_kwargs, **override})
        for override in wrong_value_overrides:
            with self.assertRaises(ValueError):
                ht.optim.DASO(**{**valid_kwargs, **override})

    if ht.MPI_WORLD.size != 8 or torch.cuda.device_count() == 0:
        # only run these tests for 2 nodes, each of which has 4 GPUs
        return

    # Training settings
    torch.manual_seed(1)
    gpus = torch.cuda.device_count()
    loc_rank = ht.MPI_WORLD.rank % gpus
    device = f"cuda:{loc_rank}"
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    os.environ["NCCL_SOCKET_IFNAME"] = "ib"
    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group(backend="nccl", rank=loc_rank, world_size=gpus)
    torch.cuda.set_device(device)
    device = torch.device("cuda")

    model = Model().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    epochs = 20

    daso_optimizer = ht.optim.DASO(
        local_optimizer=optimizer,
        total_epochs=epochs,
        max_global_skips=8,
        stability_level=0.9999,
        warmup_epochs=1,
        cooldown_epochs=1,
        verbose=True,
    )
    dp_model = ht.nn.DataParallelMultiGPU(model, daso_optimizer)

    target = torch.rand((20, 2, 10), device=ht.get_device().torch_device)
    for epoch in range(epochs):
        ls = train(dp_model, device, daso_optimizer, target, batches=20)
        if epoch == 0:
            first_ls = ls
        daso_optimizer.epoch_loss_logic(ls)
    # test that the loss decreases
    self.assertTrue(ls < first_ls)

    # test if the smaller split value also works
    daso_optimizer.reset()
    epochs = 4
    daso_optimizer = ht.optim.DASO(
        local_optimizer=optimizer,
        total_epochs=epochs,
        max_global_skips=8,
        stability_level=0.9999,
        warmup_epochs=2,
        cooldown_epochs=1,
        use_mpi_groups=False,
        verbose=False,
        downcast_type=torch.half,
        sending_chunk_size=61194,
    )
    dp_model = ht.nn.DataParallelMultiGPU(model, daso_optimizer)
    scaler = torch.cuda.amp.GradScaler()
    daso_optimizer.add_scaler(scaler)
    for epoch in range(epochs):
        ls = train(dp_model, device, daso_optimizer, target, batches=20, scaler=scaler)
        if epoch == 0:
            first_ls = ls
        daso_optimizer.epoch_loss_logic(ls, loss_globally_averaged=True)
    # test that the loss decreases
    self.assertTrue(ls < first_ls)

    # deliberately put the optimizer into invalid states and expect ValueError
    with self.assertRaises(ValueError):
        daso_optimizer._prev_params = [1, 2]
        daso_optimizer._gs_rcv_update_params_last_batch(current_ranks=[0, 4])
    with self.assertRaises(ValueError):
        daso_optimizer.last_batch = None
        daso_optimizer.step()