def test_initialization(self):
    params = Parameters()
    workers = [Worker(0, params)]
    # np.float is removed in recent NumPy and is not a valid torch dtype; use torch.float64.
    zero_tensor = torch.zeros(params.n_dimensions, dtype=torch.float64)
    update = ArtemisUpdate(params, workers)
    self.assertTrue(torch.equal(update.g, zero_tensor))
    self.assertTrue(torch.equal(update.h, zero_tensor))
    self.assertTrue(torch.equal(update.v, zero_tensor))
    self.assertTrue(torch.equal(update.l, zero_tensor))
    self.assertTrue(torch.equal(update.value_to_compress, zero_tensor))
def test_Diana(self):
    params = Diana().define(n_dimensions=DIM, nb_devices=1, quantization_param=10)
    params.up_learning_rate = 0.5
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    new_model_param = update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    self.assertFalse(torch.equal(update.h, zero_tensor))
    self.assertTrue(torch.equal(update.H, zero_tensor))
    # Check that nothing has been quantized for the returned value.
    # NB: there is a problem with this test; it passes only when run with Artemis settings.
    self.assertTrue(torch.equal(update.value_to_compress, zero_tensor))
def test_Artemis(self):
    params = Artemis().define(n_dimensions=DIM, nb_devices=1, quantization_param=10)
    params.up_learning_rate = 0.5
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    self.assertFalse(torch.equal(update.h, zero_tensor))
    # Check that the down memory H has not been updated.
    self.assertTrue(torch.equal(update.H, zero_tensor))
    # Check that the correct value has been compressed.
    self.assertTrue(torch.equal(update.value_to_compress, update.g))
def test_QSGD(self):
    params = Qsgd().define(n_dimensions=DIM, nb_devices=1, quantization_param=10)
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    # Check that no memory has been updated.
    self.assertTrue(torch.equal(update.h, zero_tensor))
    self.assertTrue(torch.equal(update.H, zero_tensor))
    # Check that nothing has been quantized for the returned value.
    self.assertTrue(torch.equal(update.value_to_compress, zero_tensor))
def setUpClass(cls):
    """Build the shared test resources once.

    This setup is slow, so it is done in setUpClass() and the results are stored
    as class variables instead of being rebuilt for each test.
    """
    super(TestRandomizedAlgo, cls).setUpClass()
    cls.cost_models = build_several_cost_model(RMSEModel, X, Y, number_of_device)
    cls.params = RandMCM().define(n_dimensions=dim,
                                  nb_devices=number_of_device,
                                  up_compression_model=SQuantization(1, dim),
                                  down_compression_model=SQuantization(1, dim),
                                  nb_epoch=1,
                                  cost_models=cls.cost_models,
                                  step_formula=constant_step_size)
    cls.params.down_learning_rate = 1 / cls.params.down_compression_model.omega_c
    cls.params.up_learning_rate = 1
    cls.workers = [Worker(i, cls.params, LocalArtemisUpdate) for i in range(number_of_device)]
def test_doubleMODELcompression_without_memory(self):
    params = SGDDoubleModelCompressionWithoutMem().define(
        n_dimensions=DIM, nb_devices=1, quantization_param=10)
    params.learning_rate = 0.5
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    new_w = update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    self.assertFalse(torch.equal(update.h, zero_tensor))
    # Check that l has not been updated (no memory in this variant).
    self.assertTrue(torch.equal(update.l, zero_tensor))
    # Check that the correct value has been compressed.
    self.assertTrue(torch.equal(update.value_to_compress, new_w))
def __init__(self, parameters: Parameters) -> None:
    """Initialization of the gradient descent.

    It initializes all the workers of the network, the sequence of (averaged) losses
    and the sequence of (averaged) models.

    Args:
        parameters: the parameters of the descent.
    """
    super().__init__()
    self.parameters = parameters
    self.train_losses = []
    self.norm_error_feedback = []
    self.dist_to_model = [torch.tensor(0.)]
    self.var_models = [torch.tensor(0.)]
    self.model_params = []
    self.averaged_model_params = []
    self.averaged_train_losses = []
    self.memory_info = None

    # If the up-link memory is used and compression is active, default the up learning rate
    # to 1 / (2 * (omega_c + 1)); without memory or without compression it is set to zero.
    if self.parameters.use_up_memory and self.parameters.up_compression_model.omega_c != 0 \
            and self.parameters.up_learning_rate is None:
        self.parameters.up_learning_rate = 1 / (2 * (self.parameters.up_compression_model.omega_c + 1))
    elif not self.parameters.use_up_memory or self.parameters.up_compression_model.omega_c == 0:
        self.parameters.up_learning_rate = 0

    # Same rule for the down-link memory.
    if self.parameters.use_down_memory and self.parameters.down_compression_model.omega_c != 0 \
            and self.parameters.down_learning_rate is None:
        self.parameters.down_learning_rate = 1 / (2 * (self.parameters.down_compression_model.omega_c + 1))
    elif not self.parameters.use_down_memory or self.parameters.down_compression_model.omega_c == 0:
        self.parameters.down_learning_rate = 0

    if self.parameters.use_up_memory:
        self.parameters.error_feedback_coef = 1 / (self.parameters.up_compression_model.omega_c + 1)

    # Creating each worker of the network.
    self.workers = [Worker(i, parameters, self.__local_update__())
                    for i in range(self.parameters.nb_devices)]

    # Instantiate the update method of the gradient descent.
    self.update = self.__update_method__()
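# Illustrative sketch (not part of the library): the defaults set in the initializer above
# follow the rule lr = 1 / (2 * (omega_c + 1)) when memory is used and compression is active,
# and 0 otherwise. The helper below is hypothetical and only mirrors that rule.
def default_memory_learning_rate(use_memory: bool, omega_c: float) -> float:
    """Default memory learning rate as computed in the initializer above."""
    if use_memory and omega_c != 0:
        return 1 / (2 * (omega_c + 1))
    return 0.0

# For instance, with omega_c = 1 the default learning rate is 1 / (2 * 2) = 0.25,
# and it drops to 0 as soon as the memory is disabled or there is no compression.
assert default_memory_learning_rate(True, 1) == 0.25
assert default_memory_learning_rate(False, 1) == 0.0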
def test_doubleMODELcompression_WITH_memory(self):
    params = MCM().define(n_dimensions=DIM, nb_devices=1, quantization_param=10)
    params.up_learning_rate = 0.5
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    # We artificially set a non-zero memory to check that it impacts the update computation.
    artificial_l = ones_tensor.clone().detach()
    update.H = artificial_l.clone().detach()
    new_w = update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    self.assertFalse(torch.equal(update.h, zero_tensor))
    # Check that the memory H has been updated.
    self.assertFalse(torch.equal(update.H, artificial_l))
    # Check that the correct value has been compressed.
    self.assertTrue(torch.equal(update.value_to_compress, new_w - artificial_l))
def test_doubleGRADIENTcompression_WITH_additional_memory(self):
    params = DoreVariant().define(n_dimensions=DIM, nb_devices=1, quantization_param=10)
    params.up_learning_rate = 0.5
    workers = [Worker(0, params)]
    workers[0].set_data(x, y)
    workers[0].cost_model.L = workers[0].cost_model.local_L
    update = ArtemisUpdate(params, workers)
    # We artificially set a different memory to check that it impacts the update computation.
    artificial_l = ones_tensor.clone().detach()
    update.H = artificial_l.clone().detach()
    update.compute(w, 2, 2)
    # Check that gradients have been updated.
    self.assertFalse(torch.equal(update.g, zero_tensor))
    self.assertFalse(torch.equal(update.v, zero_tensor))
    self.assertFalse(torch.equal(update.h, zero_tensor))
    # Check that the memory H has been updated.
    self.assertFalse(torch.equal(update.H, artificial_l))
    # Check that the correct value has been compressed.
    self.assertTrue(torch.equal(update.value_to_compress, update.g - artificial_l))
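# Illustrative sketch (not part of the test suite): the two tests above verify that, when a
# memory term is present, the quantity sent to the compressor is the difference with the memory
# (update.g - artificial_l, or new_w - artificial_l), not the raw value. The toy quantizer below
# is hypothetical; it only shows why compressing that difference reduces the compression error
# when the memory tracks the value well.
import torch

def toy_quantize(x: torch.Tensor) -> torch.Tensor:
    """Crude sign-based quantizer scaled by the norm of x (illustration only)."""
    return torch.norm(x) * torch.sign(x) / x.numel() ** 0.5

gradient = torch.tensor([1.1, 0.9, 1.0])
memory = torch.ones(3)
# Compressing the difference (close to zero) loses much less information than compressing
# the raw gradient, which is the rationale for the value_to_compress checks above.
error_raw = torch.norm(toy_quantize(gradient) - gradient)
error_diff = torch.norm(toy_quantize(gradient - memory) - (gradient - memory))
assert error_diff < error_raw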
def __init__(self, parameters: Parameters) -> None:
    """Initialization of the gradient descent.

    It initializes all the workers of the network, the sequence of (averaged) losses
    and the sequence of (averaged) models.

    Args:
        parameters: the parameters of the descent.
    """
    super().__init__()
    self.parameters = parameters
    self.losses = []
    self.model_params = []
    self.averaged_model_params = []
    self.averaged_losses = []
    self.X, self.Y = None, None

    if self.parameters.quantization_param != 0:
        self.parameters.omega_c = s_quantization_omega_c(self.parameters.n_dimensions,
                                                         self.parameters.quantization_param)
        # If learning_rate is None, we set it to the optimal value.
        if self.parameters.learning_rate is None:
            self.parameters.learning_rate = 1 / (2 * (self.parameters.omega_c + 1))
        else:
            if not self.parameters.force_learning_rate:
                self.parameters.learning_rate *= 1 / (self.parameters.omega_c + 1)
    # If quantization_param == 0, there is no compression, so we do not want to "predict"
    # values from previous ones, and thus the learning rate is set to zero.
    else:
        self.parameters.learning_rate = 0

    # Creating each worker of the network.
    self.workers = [Worker(i, parameters, self.__local_update__())
                    for i in range(self.parameters.nb_devices)]
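# Illustrative sketch (an assumption, not the library's implementation): for s-level quantization,
# the standard variance bound gives omega_c = min(d / s**2, sqrt(d) / s), which is presumably what
# s_quantization_omega_c computes above. With that value, the default learning rate becomes
# 1 / (2 * (omega_c + 1)).
import math

def s_quantization_omega_c_sketch(n_dimensions: int, s: int) -> float:
    """Variance factor of s-level quantization (standard bound, assumed here)."""
    return min(n_dimensions / s ** 2, math.sqrt(n_dimensions) / s)

# Example: d = 100 and s = 10 give omega_c = min(1.0, 1.0) = 1.0,
# hence a default learning rate of 1 / (2 * (1 + 1)) = 0.25.
omega_c = s_quantization_omega_c_sketch(100, 10)
print(omega_c, 1 / (2 * (omega_c + 1)))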