class ModelAPPNP(torch.nn.Module):
    def __init__(self, K, alpha, hidden, activation, data):
        super(ModelAPPNP, self).__init__()
        self.linear_1 = Linear(data.num_features, hidden)
        self.conv = APPNP(K, alpha)
        self.linear_2 = Linear(hidden, data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(
            self.conv.parameters()) + list(self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        edge_index, edge_weight = dropout_adj(edge_index, edge_attr=edge_weight,
                                              p=0.8, training=self.training)
        x = self.linear_1(x)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        x = self.conv(x, edge_index, edge_weight=edge_weight)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
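A minimal usage sketch (not part of the original snippet; it assumes PyTorch Geometric's APPNP and the other helpers used by the class are already imported). The toy SimpleNamespace object below is a made-up stand-in for the `num_features`/`num_class`/`x`/`edge_index`/`edge_weight` attributes the class expects:

# Hedged example: exercise ModelAPPNP on a random 10-node graph (illustration only).
import torch
from types import SimpleNamespace

toy_data = SimpleNamespace(
    x=torch.randn(10, 16),                     # 10 nodes, 16 features each
    edge_index=torch.randint(0, 10, (2, 40)),  # 40 random directed edges
    edge_weight=None,
    num_features=16,
    num_class=3,
)
model = ModelAPPNP(K=10, alpha=0.1, hidden=64, activation="relu", data=toy_data)
model.eval()                 # disable the dropout / dropout_adj branches
log_probs = model(toy_data)  # shape [10, 3]: log-probabilities per node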
def __init__(self,
             in_channels,
             out_channels,
             hiddens=[],
             activations=[],
             dropout=0.5,
             l2_norm=5e-5,
             lr=0.2,
             use_bias=False):
    super().__init__()
    if len(hiddens) != len(activations):
        raise RuntimeError(
            "Arguments 'hiddens' and 'activations' should have the same length."
            " Or you can set both of them to `[]`.")
    self.layers = ModuleList()
    paras = []
    inc = in_channels
    for hidden, activation in zip(hiddens, activations):
        layer = Linear(inc, hidden, bias=use_bias)
        paras.append(dict(params=layer.parameters(), weight_decay=l2_norm))
        self.layers.append(layer)
        inc = hidden

    layer = Linear(inc, out_channels, bias=use_bias)
    self.layers.append(layer)
    # do not use weight_decay in the final layer
    paras.append(dict(params=layer.parameters(), weight_decay=0.0))

    self.dropout = Dropout(dropout)
    self.optimizer = optim.Adam(paras, lr=lr)
    self.loss_fn = torch.nn.CrossEntropyLoss()
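The `paras` list built above follows PyTorch's standard per-parameter-group convention, where each group can carry its own `weight_decay`. A standalone sketch of the same pattern (the layer sizes are hypothetical, not from the original code):

import torch
from torch import optim
from torch.nn import Linear

hidden_layer = Linear(16, 32)
output_layer = Linear(32, 7)
optimizer = optim.Adam(
    [
        {"params": hidden_layer.parameters(), "weight_decay": 5e-5},  # L2-regularized hidden layer
        {"params": output_layer.parameters(), "weight_decay": 0.0},   # no decay on the output layer
    ],
    lr=0.2,
)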
class ModelGAT(torch.nn.Module):
    def __init__(self, num_layers, hidden_list, activation, data):
        super(ModelGAT, self).__init__()
        assert len(hidden_list) == num_layers + 1
        self.linear_1 = Linear(data.num_features, hidden_list[0])
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers):
            self.convs.append(GATConv(hidden_list[i], hidden_list[i + 1]))
        self.linear_2 = Linear(hidden_list[-1], data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(
            self.convs.parameters()) + list(self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.linear_1(x)
        x = self.activation(x)
        x = dropout(x, p=0.5, training=self.training)
        for i in range(len(self.convs)):
            x = self.convs[i](x, edge_index)
            if i != len(self.convs) - 1:
                x = self.activation(x)
                x = dropout(x, p=0.5, training=self.training)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2)
        model.bias.data.fill_(0.0)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    assert oss in ["none", "ada-oss", "wrapper-oss", "oss-wrapper"]
    if oss == "ada-oss":
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    elif oss == "wrapper-oss":
        optim = AdaScaleWrapper(model.parameters(), optim_cls=OSS, optim=SGD, lr=0.1)
    elif oss == "oss-wrapper":
        optim = OSS(model.parameters(), AdaScaleWrapper, optim_cls=SGD, lr=0.1)
    else:
        assert oss == "none"
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    if "expected_gain" in test_case:
        assert np.allclose(optim.gain(), test_case["expected_gain"]), "{} vs {}".format(
            optim.gain(), test_case["expected_gain"])

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
class SimpleDDPGAgent(Module):
    def __init__(self, **kwargs):
        super(SimpleDDPGAgent, self).__init__()
        hidden_size = kwargs['hidden_size']
        # actor
        self.actor_linears = ModuleList(
            [Linear(kwargs['state_dim'], hidden_size[0])])
        self.actor_linears.extend([
            Linear(hidden_size[i - 1], hidden_size[i])
            for i in range(1, len(hidden_size))
        ])
        self.action = Linear(hidden_size[-1], kwargs['action_dim'])
        # critic
        self.critic_linears = ModuleList([
            Linear(kwargs['state_dim'] + kwargs['action_dim'], hidden_size[0])
        ])
        self.critic_linears.extend([
            Linear(hidden_size[i - 1], hidden_size[i])
            for i in range(1, len(hidden_size))
        ])
        self.q = Linear(hidden_size[-1], 1)
        self.relu = ReLU()
        self.sigmoid = Sigmoid()
        self.tanh = Tanh()
        self.apply(init_weights)  # xavier uniform init

    def act(self, state):
        x = state
        for l in self.actor_linears:
            x = l(x)
            x = self.relu(x)
        action = self.tanh(self.action(x))
        return action

    def Q(self, state, action):
        x = torch.cat([state, action], dim=1)
        for l in self.critic_linears:
            x = l(x)
            x = self.relu(x)
        q = self.q(x)
        return q

    def get_actor_parameters(self):
        return list(self.actor_linears.parameters()) + list(
            self.action.parameters())

    def get_critic_parameters(self):
        return list(self.critic_linears.parameters()) + list(
            self.q.parameters())
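A hedged instantiation sketch (not from the original code; it assumes the `init_weights` helper used by the class is in scope, and the state/action sizes are made up for illustration):

import torch

agent = SimpleDDPGAgent(state_dim=4, action_dim=2, hidden_size=[64, 64])
state = torch.randn(8, 4)                    # batch of 8 states
action = agent.act(state)                    # tanh-squashed actions in [-1, 1], shape [8, 2]
q_value = agent.Q(state, action)             # critic estimate, shape [8, 1]
actor_params = agent.get_actor_parameters()  # e.g. for a separate actor optimizer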
def test_save_load_checkpoints():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1),)
    experts[expert_name] = ExpertBackend(
        name=expert_name,
        expert=expert,
        opt=opt,
        args_schema=args_schema,
        outputs_schema=BatchTensorDescriptor(1),
        max_batch_size=1,
    )

    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        expert.weight.data[0] = 1
        store_experts(experts, tmp_path)
        expert.weight.data[0] = 2
        store_experts(experts, tmp_path)
        expert.weight.data[0] = 3
        store_experts(experts, tmp_path)

        checkpoints_dir = tmp_path / expert_name
        assert checkpoints_dir.exists()
        assert len(list(checkpoints_dir.iterdir())) == 3

        expert.weight.data[0] = 4
        load_weights(experts, tmp_path)
        assert expert.weight.data[0] == 3
def test_one_iteration(self):
    """Test FSDP with uneven divide of parameter shards."""
    model = Linear(3, 3, bias=False)
    input = torch.rand(8, 3)
    my_lr = 0.1

    ref_forward_output_my_rank, ref_weight_out = self._get_ref_results(
        model, input, my_lr)

    model.to(self.rank)
    model = FSDP(model)
    optim = SGD(model.parameters(), lr=my_lr)
    self.assertTrue(len(input) >= self.world_size)
    in_data = torch.Tensor(input[self.rank]).to(self.rank)
    out = model(in_data)
    out.float().sum().backward()
    optim.step()
    optim.zero_grad()

    with model._summon_full_params():
        torch.cuda.synchronize()  # TODO: This is here because it was
        # originally part of get_full_params(), debug why it is needed here.
        weight_out = model.module.weight.T.clone()
        self.assertEqual(ref_forward_output_my_rank, out)
        self.assertEqual(ref_weight_out, weight_out)
def train(data_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Linear(3, 1).to(device)
    lr = 1e-3
    optimizer = AdamW(net.parameters(), lr)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
    criterion = CrossEntropyLoss()
    # create the writer once, not on every batch; the path must be a raw string
    writer = SummaryWriter(
        log_dir=r'C:\Users\andre\OneDrive\Рабочий стол\train\checkpoints',
        comment="Batch loss")
    max_epochs = 40
    for ep_id in range(max_epochs):
        net.train()
        for b_id, batch in enumerate(data_loader):
            optimizer.zero_grad()
            output = net(batch['data'].to(device))
            loss = criterion(output, batch['ground_truth'].to(device))
            writer.add_scalar("Batch loss", loss.item(),
                              ep_id * len(data_loader) + b_id)
            loss.backward()
            optimizer.step()
        scheduler.step()
        torch.save(
            {
                'epoch': ep_id,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, '/checkpoints/checkpoints.txt')
def test_grad_accum_cpu(cpu=True):
    """Test the basic functionality on CPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    for expected_gain in [2.0, 2.0]:  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data = Tensor([0.0, 1.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # grad pass 2
        in_data = Tensor([1.0, 0.0])
        if not cpu:
            in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        # stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        assert np.allclose(optim.gain(), expected_gain), optim.gain()
        optim.step()
        optim.zero_grad()
def test_grad_accum(test_case, cpu):
    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
    model = Linear(2, 2, bias=False)
    if not cpu:
        if torch.cuda.device_count() < 1:
            pytest.skip("1 GPU is required")
        model = model.cuda()
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    expected_gain = test_case["expected_gain"]
    if "input" in test_case:
        data = [test_case["input"]] * 2
        gains = [expected_gain] * 2
    else:
        data = test_case["inputs"]
        gains = [None, expected_gain]
    for in_data, exp_gain in zip(data, gains):  # test 2 iterations to catch more corner cases.
        # grad pass 1
        in_data_0 = Tensor(in_data[0])
        if not cpu:
            in_data_0 = in_data_0.cuda()
        out = model(in_data_0)
        out.sum().backward()
        # grad pass 2
        in_data_1 = Tensor(in_data[1])
        if not cpu:
            in_data_1 = in_data_1.cuda()
        out = model(in_data_1)
        out.sum().backward()
        if exp_gain is not None:
            assert np.allclose(optim.gain(), exp_gain), optim.gain()
        # stepping it. Note that if we did more than 2 passes as promised by the
        # num_gradients_to_accumulate argument above, AdaScale will not be able to
        # detect that mistake for now. The result will just be wrong in that case.
        optim.step()
        optim.zero_grad()
def test_restore_update_count():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1),)
    expert_backend = ExpertBackend(
        name=expert_name,
        expert=expert,
        opt=opt,
        args_schema=args_schema,
        outputs_schema=BatchTensorDescriptor(1),
        max_batch_size=1,
    )
    experts[expert_name] = expert_backend

    batch = torch.randn(1, 1)
    loss_grad = torch.randn(1, 1)

    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        for _ in range(BACKWARD_PASSES_BEFORE_SAVE):
            expert_backend.backward(batch, loss_grad)

        store_experts(experts, tmp_path)

        for _ in range(BACKWARD_PASSES_AFTER_SAVE):
            expert_backend.backward(batch, loss_grad)

        load_weights(experts, tmp_path)
        assert experts[expert_name].update_count == BACKWARD_PASSES_BEFORE_SAVE
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    def find_tensor():
        """Find the weight tensor from the heap. Return True if found."""
        for obj in gc.get_objects():
            try:
                # Only need to check parameter type objects
                if "torch.nn.parameter.Parameter" not in str(type(obj)):
                    continue
                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                    if obj.shape == (456, 123):
                        return True
            except Exception:
                pass
        return False

    torch.cuda.empty_cache()
    assert find_tensor(), "something wrong with gc-based method to find the tensor"

    optim.unhook()
    del model
    del optim

    torch.cuda.empty_cache()
    assert not find_tensor(), "tensor should have been released"
def test_create_supervised():
    model = Linear(1, 1)
    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)
    trainer = create_supervised(model, optimizer, mse_loss)

    x = torch.FloatTensor([[1.0], [2.0]])
    y = torch.FloatTensor([[3.0], [5.0]])
    data = [(x, y)]

    trainer.validate(data)
    y_pred, y = trainer.validation_history[0]

    assert y_pred[0, 0] == approx(0.0)
    assert y_pred[1, 0] == approx(0.0)
    assert y[0, 0] == approx(3.0)
    assert y[1, 0] == approx(5.0)
    assert model.weight.data[0, 0] == approx(0.0)
    assert model.bias.data[0] == approx(0.0)

    trainer.run(data)
    loss = trainer.training_history[0]

    assert loss == approx(17.0)
    assert model.weight.data[0, 0] == approx(1.3)
    assert model.bias.data[0] == approx(0.8)
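A worked check (added note, not part of the original test) of where the asserted numbers come from: one SGD step on MSE with lr = 0.1, starting from w = b = 0.

# loss  = mean((0 - 3)^2, (0 - 5)^2)     = (9 + 25) / 2  = 17.0
# dL/dw = mean(2*(0 - 3)*1, 2*(0 - 5)*2) = (-6 - 20) / 2 = -13  ->  w = 0 - 0.1*(-13) = 1.3
# dL/db = mean(2*(0 - 3),   2*(0 - 5))   = (-6 - 10) / 2 = -8   ->  b = 0 - 0.1*(-8)  = 0.8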
def _test_create_supervised_trainer(
    model_device: Optional[str] = None,
    trainer_device: Optional[str] = None,
    trace: bool = False,
    amp_mode: Optional[str] = None,
    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
):
    model = Linear(1, 1)

    if model_device:
        model.to(model_device)

    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)

    if trace:
        example_input = torch.randn(1, 1)
        model = torch.jit.trace(model, example_input)

    if amp_mode == "apex" and model_device == trainer_device == "cuda":
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    trainer = create_supervised_trainer(
        model,
        optimizer,
        mse_loss,
        device=trainer_device,
        output_transform=lambda x, y, y_pred, loss: (y_pred, loss.item()),
        amp_mode=amp_mode,
        scaler=scaler,
    )

    x = torch.tensor([[0.1], [0.2]])
    y = torch.tensor([[0.3], [0.5]])
    data = [(x, y)]

    assert model.weight.data[0, 0].item() == approx(0.0)
    assert model.bias.item() == approx(0.0)

    if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
        state = trainer.run(data)

        assert state.output[-1] == approx(0.17), state.output[-1]
        assert round(model.weight.data[0, 0].item(), 3) == approx(0.013), model.weight.item()
        assert round(model.bias.item(), 3) == approx(0.08), model.bias.item()

        if amp_mode == "amp":
            assert state.output[0].dtype is torch.half
            if scaler and isinstance(scaler, bool):
                assert hasattr(state, "scaler")
            else:
                assert not hasattr(state, "scaler")
    else:
        if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
            # This is broken in 1.6.0 but will probably be fixed with 1.7.0
            with pytest.raises(RuntimeError, match=r"is on CPU, but expected them to be on GPU"):
                trainer.run(data)
def _test_basic_func(rank, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad

    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad

    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad
def test_save_load_checkpoints():
    experts = {}
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), 0.0)
    expert_name = 'test_expert'
    args_schema = (BatchTensorDescriptor(1),)
    experts[expert_name] = ExpertBackend(
        name=expert_name,
        expert=expert,
        opt=opt,
        args_schema=args_schema,
        outputs_schema=BatchTensorDescriptor(1),
        max_batch_size=1,
    )

    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        for i in range(1, EXPERT_WEIGHT_UPDATES + 1):
            expert.weight.data[0] = i
            store_experts(experts, tmp_path)

        checkpoints_dir = tmp_path / expert_name

        assert checkpoints_dir.exists()
        # include checkpoint_last.pt
        assert len(list(checkpoints_dir.iterdir())) == EXPERT_WEIGHT_UPDATES + 1

        expert.weight.data[0] = 0

        load_weights(experts, tmp_path)
        assert expert.weight.data[0] == EXPERT_WEIGHT_UPDATES
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]

    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()

    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()

    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()
def _test_grad_accum_func(rank, world_size, tempfile_name):
    _dist_init(rank, world_size, tempfile_name, backend="gloo")  # Covers gloo

    model = Linear(4, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    with model.no_sync():
        # iter 1, input vectors are pointing dim0 and dim1
        in_data = Tensor([0.0] * 4)
        in_data[rank] = 1.0
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
    # iter 2, input vectors are pointing dim2 and dim3
    in_data = Tensor([0.0] * 4)
    in_data[rank + 2] = 1.0
    in_data = in_data.cuda()
    out = model(in_data)
    out.sum().backward()
    # since all inputs are orthogonal, the gain should be exactly 4.0.
    assert np.allclose(optim.gain(), 4.0), optim.gain()
    optim.step()
    optim.zero_grad()

    dist.destroy_process_group()
def _test_apex_average(device, amp_mode, opt_level):
    assert amp_mode == "apex"
    assert device == "cuda"

    model = Linear(1, 1)

    if device:
        model.to(device)

    model.weight.data.zero_()
    model.bias.data.zero_()
    optimizer = SGD(model.parameters(), 0.1)

    from apex import amp

    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    mean_var = VariableAccumulation(lambda a, x: a + x)
    y_true = torch.rand(100).float().to(device)

    for y in y_true:
        mean_var.update(y)

    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true)
def _test_create_mocked_supervised_trainer(
    model_device: Optional[str] = None,
    trainer_device: Optional[str] = None,
    trace: bool = False,
    amp_mode: Optional[str] = None,
    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
):
    with mock.patch("ignite.engine.supervised_training_step_amp") as training_step_amp_mock:
        with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock:
            with mock.patch("ignite.engine.supervised_training_step_tpu") as training_step_tpu_mock:
                with mock.patch("ignite.engine.supervised_training_step") as training_step_mock:
                    model = Linear(1, 1)

                    if model_device:
                        model.to(model_device)

                    model.weight.data.zero_()
                    model.bias.data.zero_()
                    optimizer = SGD(model.parameters(), 0.1)

                    if trace:
                        example_input = torch.randn(1, 1)
                        model = torch.jit.trace(model, example_input)

                    if amp_mode == "apex" and model_device == trainer_device == "cuda":
                        from apex import amp

                        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

                    trainer = create_supervised_trainer(
                        model,
                        optimizer,
                        mse_loss,
                        device=trainer_device,
                        output_transform=lambda x, y, y_pred, loss: (y_pred, loss.item()),
                        amp_mode=amp_mode,
                        scaler=scaler,
                    )

                    x = torch.tensor([[0.1], [0.2]])
                    y = torch.tensor([[0.3], [0.5]])
                    data = [(x, y)]

                    assert model.weight.data[0, 0].item() == approx(0.0)
                    assert model.bias.item() == approx(0.0)

                    on_tpu = "xla" in trainer_device if trainer_device is not None else False
                    mode, _ = _check_arg(on_tpu, amp_mode, scaler)

                    if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
                        trainer.run(data)

                        if mode == "amp":
                            assert training_step_amp_mock.called
                        elif mode == "apex":
                            assert training_step_apex_mock.called
                        elif mode == "tpu":
                            assert training_step_tpu_mock.called
                        else:
                            assert training_step_mock.called
def test_basic_cpu():
    """Test single batch behavior on CPU"""
    model = Linear(2, 2, bias=False)
    try:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    except RuntimeError:
        return
    assert False, "Single batch AdaScale should not be supported"
class ModelGCN(torch.nn.Module):
    def __init__(self, num_layers, hidden_list, activation, data):
        super(ModelGCN, self).__init__()
        assert len(hidden_list) == num_layers + 1
        self.linear_1 = Linear(data.num_features, hidden_list[0])
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers):
            self.convs.append(GCNConv(hidden_list[i], hidden_list[i + 1]))
        self.JK = JumpingKnowledge(mode='max')
        self.linear_2 = Linear(hidden_list[-1], data.num_class)
        if activation == "relu":
            self.activation = relu
        elif activation == "leaky_relu":
            self.activation = leaky_relu
        self.reg_params = list(self.linear_1.parameters()) + list(
            self.convs.parameters()) + list(self.JK.parameters()) + list(
                self.linear_2.parameters())

    def reset_parameters(self):
        self.linear_1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        self.linear_2.reset_parameters()

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        edge_index, edge_weight = dropout_adj(edge_index, edge_attr=edge_weight,
                                              p=0.8, training=self.training)
        x_jk = []
        x = self.linear_1(x)
        x = self.activation(x)
        x_jk.append(dropout(x, p=0.5, training=self.training))
        for i in range(len(self.convs)):
            x = self.convs[i](x_jk[-1], edge_index, edge_weight=edge_weight)
            if i != len(self.convs) - 1:
                x_jk.append(self.activation(x))
            else:
                x_jk.append(dropout(x, p=0.5, training=self.training))
        x = self.JK(x_jk)
        x = self.linear_2(x)
        return log_softmax(x, dim=-1)
def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl

    model = Linear(2, 2)
    model.to("cuda")
    if ddp_cls is DDP:
        model = ddp_cls(model, device_ids=[rank])
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    elif ddp_cls is SDP:
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
        model = ddp_cls(model, sharded_optimizer=optim)
    else:
        assert ddp_cls is FSDP, ddp_cls
        # Two cases:
        #    flatten=True : AdaScale wrapper must be after FSDP and it receives
        #                   a single grad tensor. It won't receive grad if
        #                   wrapped before.
        #    flatten=False: AdaScale can be both before or after FSDP.
        # So, it is better to do AdaScale after FSDP.
        model = ddp_cls(model, flatten_parameters=False)
        optim = AdaScale(SGD(model.parameters(), lr=0.1))
    if "input" in test_case:
        # single iter
        in_data = Tensor(test_case["input"][rank])
        in_data = in_data.cuda()
        out = model(in_data)
        out.sum().backward()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
        optim.step()
        optim.zero_grad()
    else:
        # multiple iters
        for in_data in test_case["inputs"]:
            in_data = Tensor(in_data[rank]).cuda()
            out = model(in_data)
            out.sum().backward()
            optim.step()
            optim.zero_grad()
        if ddp_cls is DDP:
            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    dist.destroy_process_group()
def test_zero_model(self):
    model = Linear(3, 1)
    init.constant_(model.weight, 0)
    init.constant_(model.bias, 0)

    optim = torch.optim.SGD(model.parameters(), lr=0.01)

    trial = torchbearer.Trial(model, optim, loss)
    trial.with_test_data(torch.rand(10, 3), batch_size=3)
    preds = trial.predict()

    for i in range(len(preds)):
        self.assertAlmostEqual(preds[i], 0)
def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1),
                     num_gradients_to_accumulate=2,
                     debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()
def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)

    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()
def test_mc_loss():
    num_batches = 2
    num_classes = 4
    chi = 2  # 4 classes
    x = torch.ones(num_batches, chi * num_classes)
    x[:, 0, ...] = 10
    target = torch.zeros(num_batches, dtype=torch.long)
    mod = Linear(chi * num_classes, chi * num_classes)

    # Check backprop
    for reduction in ['mean', 'sum', 'none']:
        for p in mod.parameters():
            p.grad = None
        train_loss = F.mutual_channel_loss(mod(x), target, ignore_index=0, reduction=reduction)
        if reduction == 'none':
            assert train_loss.shape == (num_batches,)
            train_loss = train_loss.sum()
        train_loss.backward()
        assert isinstance(mod.weight.grad, torch.Tensor)

    # Check type casting of weights
    for p in mod.parameters():
        p.grad = None
    class_weights = torch.ones(num_classes, dtype=torch.float16)
    ignore_index = 0

    criterion = nn.MutualChannelLoss(weight=class_weights, ignore_index=ignore_index, chi=chi)
    train_loss = criterion(mod(x), target)
    train_loss.backward()
    assert isinstance(mod.weight.grad, torch.Tensor)
    assert repr(criterion) == f"MutualChannelLoss(reduction='mean', chi={chi}, alpha=1)"
def __init__(self,
             in_channels,
             out_channels,
             hiddens=[],
             activations=[],
             dropout=0.5,
             weight_decay=5e-5,
             lr=0.2,
             use_bias=False):
    super().__init__()
    if len(hiddens) != len(activations):
        raise RuntimeError(
            "Arguments 'hiddens' and 'activations' should have the same length."
            " Or you can set both of them to `[]`.")
    layers = ModuleList()
    acts = []
    paras = []
    inc = in_channels
    for hidden, activation in zip(hiddens, activations):
        layer = Linear(inc, hidden, bias=use_bias)
        paras.append(dict(params=layer.parameters(), weight_decay=weight_decay))
        layers.append(layer)
        inc = hidden
        acts.append(get_activation(activation))

    layer = Linear(inc, out_channels, bias=use_bias)
    layers.append(layer)
    paras.append(dict(params=layer.parameters(), weight_decay=weight_decay))

    self.layers = layers
    self.acts = acts
    self.dropout = Dropout(dropout)
    self.compile(loss=torch.nn.CrossEntropyLoss(),
                 optimizer=optim.Adam(paras, lr=lr),
                 metrics=[Accuracy()])
def example_experts():
    expert = Linear(1, 1)
    opt = torch.optim.SGD(expert.parameters(), PEAK_LR)

    args_schema = (BatchTensorDescriptor(1),)
    expert_backend = ExpertBackend(
        name=EXPERT_NAME,
        expert=expert,
        optimizer=opt,
        scheduler=get_linear_schedule_with_warmup,
        num_warmup_steps=BACKWARD_PASSES_BEFORE_SAVE,
        num_total_steps=BACKWARD_PASSES_BEFORE_SAVE + BACKWARD_PASSES_AFTER_SAVE,
        args_schema=args_schema,
        outputs_schema=BatchTensorDescriptor(1),
        max_batch_size=1,
    )
    experts = {EXPERT_NAME: expert_backend}
    yield experts