Example No. 1
    def test_layer_same_as_params(self):
        params = copy.deepcopy(self.params_dict)
        num_hidden_layers = params.pop("num_hidden_layers")

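        # Seed before each construction so both stacks start from identical weights.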
        torch.manual_seed(1234)
        transformer_layer = TransformerLayer(**params)
        transformer_stack_from_layer = TransformerStack(
            num_hidden_layers, transformer_layer)
        torch.manual_seed(1234)
        transformer_stack_from_params = TransformerStack(
            num_hidden_layers, **params)

        hidden_states = torch.randn(2, 3, 6)
        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

        transformer_stack_from_layer.eval()
        transformer_stack_from_params.eval()

        torch.manual_seed(1234)
        layer_output = transformer_stack_from_layer.forward(
            hidden_states, attention_mask=attention_mask)

        torch.manual_seed(1234)
        params_output = transformer_stack_from_params.forward(
            hidden_states, attention_mask=attention_mask)

        assert torch.allclose(layer_output[0], params_output[0])
Example No. 2
def test_layer_from_pretrained(pretrained_name, relevant_top_level_module):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False).eval()

    if "distilbert" in pretrained_name:
        encoder = pretrained.transformer
    else:
        encoder = pretrained.encoder
    # Hacky way to get a BERT layer.
    pretrained_module = list(encoder.layer.modules())[1]

    torch.manual_seed(1234)
    module = TransformerLayer.from_pretrained_module(
        pretrained_name,
        relevant_module=None
        if relevant_top_level_module is None
        else f"{relevant_top_level_module}.encoder.layer.0",
    ).eval()

    batch_size = 2
    seq_length = 15
    hidden_size = module.attention.self.query.in_features

    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
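    # The HuggingFace layer expects an additive mask of shape (batch, 1, 1, seq_len):
    # 0.0 where attention is allowed and a large negative value at padded positions.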
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(1234)
    output = module(hidden_states, attention_mask=attention_mask.squeeze()).hidden_states

    torch.manual_seed(1234)
    hf_output = pretrained_module(hidden_states, attention_mask=attention_mask_hf)[0]

    assert torch.allclose(output, hf_output, atol=1e-04)
Example No. 3
    def test_cross_attention(self):
        params = copy.deepcopy(self.params_dict)
        params["add_cross_attention"] = True

        params = Params(params)

        transformer_layer = TransformerLayer.from_params(params)
        assert hasattr(transformer_layer, "cross_attention")

        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
        transformer_layer.forward(
            torch.randn(2, 3, 6),
            attention_mask=attention_mask,
            encoder_hidden_states=torch.randn(2, 3, 6),
        )

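        # Converting from an existing AllenNLP layer should preserve the cross-attention block.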
        transformer_layer_new = TransformerLayer.from_pretrained_module(
            transformer_layer, source="allennlp")

        assert hasattr(transformer_layer_new, "cross_attention")
Example No. 4
def test_layer_with_cross_attention(layer_params):
    layer_params["add_cross_attention"] = True

    transformer_layer = TransformerLayer.from_params(layer_params).eval()
    assert hasattr(transformer_layer, "cross_attention")

    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
    transformer_layer(
        torch.randn(2, 3, 6),
        attention_mask=attention_mask,
        encoder_hidden_states=torch.randn(2, 3, 6),
    )
Example No. 5
    def test_loading_from_pretrained_weights(self):

        # Hacky way to get a BERT layer.
        for i, pretrained_module in enumerate(
                self.pretrained.encoder.layer.modules()):
            if i == 1:
                break

        module = TransformerLayer.from_pretrained_module(pretrained_module)
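        # Invert the default name mapping so the keys refer to the pretrained module's parameters.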
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)
Example No. 6
def test_layer_matches_huggingface(layer_params, module_name, hf_module):
    layer = TransformerLayer.from_params(layer_params).eval()
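    # Copy the HuggingFace weights into our layer (after mapping parameter names)
    # so the two modules can be compared on the same inputs.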
    state_dict = layer._get_mapped_state_dict(hf_module.state_dict())
    layer.load_state_dict(state_dict)

    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    torch.manual_seed(1234)
    output = layer(hidden_states, attention_mask=attention_mask)
    # BERT, RoBERTa, and ELECTRA process the attention_mask at the model level,
    # so convert it here to the additive format the HuggingFace layer expects.
    attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5
    torch.manual_seed(1234)
    hf_output = hf_module(hidden_states, attention_mask=attention_mask_hf)

    assert torch.allclose(output.hidden_states, hf_output[0])
Example No. 7
    def test_forward_against_huggingface_outputs(self, module_name, hf_module):
        hidden_states = torch.randn(2, 3, 6)
        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

        layer = TransformerLayer.from_pretrained_module(hf_module)

        torch.manual_seed(1234)
        output = layer.forward(hidden_states, attention_mask=attention_mask)
        # BERT, RoBERTa, and ELECTRA process the attention_mask at the model level,
        # so convert it here to the additive format the HuggingFace layer expects.
        attention_mask_hf = (attention_mask == 0).view(
            (2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5
        torch.manual_seed(1234)
        hf_output = hf_module.forward(hidden_states,
                                      attention_mask=attention_mask_hf)

        assert torch.allclose(output[0], hf_output[0])
Example No. 8
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a BERT layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        torch.manual_seed(1234)
        module = TransformerLayer.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 768
        dim = module.attention.self.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, seq_len))
        # Build the additive mask HuggingFace expects: large negative values at
        # masked positions, broadcast over the 12 attention heads.
        mask_reshp = (batch_size, 1, 1, seq_len)
        attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand(
            batch_size, 12, seq_len, seq_len) * -10e5

        # Set both modules to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, attention_mask=attention_mask_hf)[0]

        assert torch.allclose(output, hf_output, atol=1e-04)
Example No. 9
def test_layer(layer_params):
    transformer_layer = TransformerLayer.from_params(layer_params.duplicate()).eval()

    assert (
        transformer_layer.attention.self.num_attention_heads == layer_params["num_attention_heads"]
    )
    assert transformer_layer.attention.self.attention_head_size == int(
        layer_params["hidden_size"] / layer_params["num_attention_heads"]
    )
    assert (
        transformer_layer.attention.self.all_head_size
        == layer_params["num_attention_heads"]
        * transformer_layer.attention.self.attention_head_size
    )
    assert transformer_layer.attention.self.query.in_features == layer_params["hidden_size"]
    assert transformer_layer.attention.self.key.in_features == layer_params["hidden_size"]
    assert transformer_layer.attention.self.value.in_features == layer_params["hidden_size"]
    assert transformer_layer.attention.self.dropout == layer_params["attention_dropout"]

    assert transformer_layer.attention.output.dense.in_features == layer_params["hidden_size"]
    assert transformer_layer.attention.output.dense.out_features == layer_params["hidden_size"]
    assert (
        transformer_layer.attention.output.layer_norm.normalized_shape[0]
        == layer_params["hidden_size"]
    )
    assert transformer_layer.attention.output.dropout.p == layer_params["hidden_dropout"]

    assert transformer_layer.intermediate.dense.in_features == layer_params["hidden_size"]
    assert transformer_layer.intermediate.dense.out_features == layer_params["intermediate_size"]

    assert transformer_layer.output.dense.in_features == layer_params["intermediate_size"]
    assert transformer_layer.output.dense.out_features == layer_params["hidden_size"]

    assert transformer_layer.output.layer_norm.normalized_shape[0] == layer_params["hidden_size"]

    assert transformer_layer.output.dropout.p == layer_params["hidden_dropout"]

    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
    transformer_layer(torch.randn(2, 3, 6), attention_mask=attention_mask)

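    # Passing encoder_hidden_states without add_cross_attention=True should fail.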
    with pytest.raises(AssertionError):
        transformer_layer(
            torch.randn(2, 3, 6),
            attention_mask=attention_mask,
            encoder_hidden_states=torch.randn(2, 3, 6),
        )
Example No. 10
    def setup_method(self):
        super().setup_method()

        self.params_dict = {
            "hidden_size": 6,
            "intermediate_size": 3,
            "num_attention_heads": 2,
            "attention_dropout": 0.1,
            "hidden_dropout": 0.2,
            "activation": "relu",
        }

        params = Params(copy.deepcopy(self.params_dict))

        self.transformer_layer = TransformerLayer.from_params(params)
        self.pretrained_name = "bert-base-uncased"

        self.pretrained = cached_transformers.get(self.pretrained_name, False)
Example No. 11
def test_transformer_stack_from_params(params):
    torch.manual_seed(SEED)
    transformer_stack = TransformerStack.from_params(params)

    # Make sure we have the right number of modules.
    modules = dict(transformer_stack.named_modules())
    assert len(modules["layers"]) == PARAMS_DICT["num_hidden_layers"]

    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    # Make sure forward pass can run.
    torch.manual_seed(SEED)
    output = transformer_stack.forward(hidden_states,
                                       attention_mask=attention_mask)

    # Make sure we get the same results when instantiating from a single layer.
    torch.manual_seed(SEED)
    layer_params = copy.deepcopy(PARAMS_DICT)
    num_hidden_layers = layer_params.pop("num_hidden_layers")
    transformer_layer = TransformerLayer(
        **layer_params)  # type: ignore[arg-type]
    transformer_stack_from_layer = TransformerStack(
        num_hidden_layers,
        transformer_layer  # type: ignore[arg-type]
    )

    torch.manual_seed(SEED)
    from_layer_output = transformer_stack_from_layer.forward(
        hidden_states, attention_mask=attention_mask)

    assert torch.allclose(from_layer_output.final_hidden_states,
                          output.final_hidden_states)

    # Make sure forward pass raises with bad input.
    with pytest.raises(AssertionError):
        transformer_stack.forward(
            torch.randn(2, 3, 6),
            attention_mask=torch.randn(2, 3),
            encoder_hidden_states=torch.randn(2, 3, 6),
        )
Example No. 12
def _load_pretrained(global_rank, world_size, gpu_id):
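    # Presumably invoked on each worker in a distributed test; this only checks
    # that loading the pretrained weights succeeds.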
    TransformerLayer.from_pretrained_module(
        "epwalsh/bert-xsmall-dummy",
    )