def test_lsh_lm_model_grad(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
    # disable dropout so the loss and gradients are deterministic
    config["hidden_dropout_prob"] = 0.0
    config["lsh_attention_probs_dropout_prob"] = 0.0
    config["num_buckets"] = [2, 4]
    config["num_hashes"] = 6
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
    model.train()
    model.zero_grad()
    input_ids, _ = self._get_input_ids_and_mask()
    loss = model(input_ids=input_ids, labels=input_ids)[0]

    self.assertTrue(torch.allclose(loss, torch.tensor(5.7819, dtype=torch.float, device=torch_device), atol=1e-3))
    loss.backward()

    # check last grads to cover all probable errors
    grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    expected_grad_slice_word = torch.tensor(
        [2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
    expected_grad_slice_pos_fac_1 = torch.tensor(
        [-0.0984, 0.6283, 0.4282, 1.2960, 0.6897],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
    expected_grad_slice_pos_fac_2 = torch.tensor(
        [0.4626, -0.0231, -0.0172, 0.1081, 0.3805],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
def test_local_lm_model_grad(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["local", "local", "local", "local"]
    # disable dropout so the loss and gradients are deterministic
    config["hidden_dropout_prob"] = 0.0
    config["local_attention_probs_dropout_prob"] = 0.0
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
    model.train()
    model.zero_grad()
    input_ids, _ = self._get_input_ids_and_mask()
    loss = model(input_ids=input_ids, labels=input_ids)[0]

    self.assertTrue(torch.allclose(loss, torch.tensor(5.7786, dtype=torch.float, device=torch_device), atol=1e-3))
    loss.backward()

    # check last grads to cover all probable errors
    grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    expected_grad_slice_word = torch.tensor(
        [-0.0005, 0.0001, 0.0002, 0.0003, 0.0006],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
    expected_grad_slice_pos_fac_1 = torch.tensor(
        [0.0037, -1.3793, -1.0231, -1.5230, -2.5306],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
    expected_grad_slice_pos_fac_2 = torch.tensor(
        [-1.3165, 0.5168, 0.7785, 1.0811, -0.9830],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
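# Note: both grad tests above rely on two helpers on the same test class that are
# not shown in this section: `_get_basic_config_and_input`, returning a dict of
# ReformerConfig keyword arguments, and `_get_input_ids_and_mask`, returning a pair
# of (input_ids, attention_mask) tensors on `torch_device`. A minimal, purely
# hypothetical sketch of the second helper is given below; the real fixture must
# use the fixed inputs that produce the hard-coded loss and gradient values above.
#
# def _get_input_ids_and_mask(self):
#     # hypothetical shapes and vocab size, for illustration only
#     input_ids = torch.randint(0, 320, (1, 64), device=torch_device)
#     attention_mask = torch.ones_like(input_ids)
#     return input_ids, attention_mask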
def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
    if not self.is_training:
        return

    # disable dropout
    config.hidden_dropout_prob = 0
    config.local_attention_probs_dropout_prob = 0
    config.lsh_attention_probs_dropout_prob = 0

    # reference run: no chunking
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.train()
    model.zero_grad()
    loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
    loss_no_chunk.backward()
    grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
    grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]

    # same run with feed-forward and LM-head chunking enabled
    config.chunk_size_lm_head = 1
    config.chunk_size_feed_forward = 1

    torch.manual_seed(0)
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.train()
    model.zero_grad()
    loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
    loss_chunk.backward()
    grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
    grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]

    # chunked and non-chunked runs must agree on loss and gradients
    self.parent.assertTrue(torch.allclose(loss_chunk, loss_no_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3))
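# A minimal sketch (not part of the original tests) of how this checker would
# typically be driven from the test class, assuming the model tester exposes a
# `prepare_config_and_inputs` helper returning (config, input_ids, input_mask,
# choice_labels) in the order expected by the method above:
#
# def test_reformer_feed_backward_chunking(self):
#     config_and_inputs = self.model_tester.prepare_config_and_inputs()
#     self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs)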