    def test_elmo_contextualizer_without_grad_frozen_scalar_mix(self):
        weights_path = self.model_paths / "lm_weights.hdf5"
        options_path = self.model_paths / "options.json"

        params = Params({
            "type": "elmo_contextualizer",
            "batch_size": 2,
            "layer_num": 1,
            "freeze_scalar_mix": True,
            "elmo": {
                "options_file": options_path,
                "weight_file": weights_path,
                "dropout": 0.0,
                "num_output_representations": 1,
                "requires_grad": False,
            }
        })
        elmo_contextualizer = Contextualizer.from_params(params)
        unpadded_representations = elmo_contextualizer([
            self.sentence_1, self.sentence_2, self.sentence_3])
        token_representations, mask = pad_contextualizer_output(
            unpadded_representations)
        loss = token_representations.sum()
        # Nothing in the contextualizer is requires_grad=True, so this
        # should be requires_grad=False and grad_fn should be None
        assert loss.grad_fn is None
        assert loss.requires_grad is False

    def test_elmo_contextualizer_with_grad(self):
        weights_path = self.model_paths / "lm_weights.hdf5"
        options_path = self.model_paths / "options.json"

        params = Params({
            "type": "elmo_contextualizer",
            "batch_size": 2,
            "elmo": {
                "options_file": options_path,
                "weight_file": weights_path,
                "dropout": 0.0,
                "num_output_representations": 1,
                "requires_grad": True,
            }
        })
        elmo_contextualizer = Contextualizer.from_params(params)
        unpadded_representations = elmo_contextualizer([
            self.sentence_1, self.sentence_2, self.sentence_3])
        token_representations, mask = pad_contextualizer_output(
            unpadded_representations)
        loss = token_representations.sum()
        loss.backward()
        elmo_grads = [param.grad for name, param in
                      elmo_contextualizer.named_parameters() if '_elmo_lstm' in name]
        assert all([grad is not None for grad in elmo_grads])

    def test_elmo_contextualizer_with_grad_frozen_scalar_mix(self):
        weights_path = self.model_paths / "lm_weights.hdf5"
        options_path = self.model_paths / "options.json"

        params = Params({
            "type": "elmo_contextualizer",
            "batch_size": 2,
            "layer_num": 1,
            "freeze_scalar_mix": True,
            "elmo": {
                "options_file": options_path,
                "weight_file": weights_path,
                "dropout": 0.0,
                "num_output_representations": 1,
                "requires_grad": True,
            }
        })
        elmo_contextualizer = Contextualizer.from_params(params)
        unpadded_representations = elmo_contextualizer([
            self.sentence_1, self.sentence_2, self.sentence_3])
        token_representations, mask = pad_contextualizer_output(
            unpadded_representations)
        loss = token_representations.sum()
        loss.backward()
        for name, param in elmo_contextualizer.named_parameters():
            if "scalar_mix" in name:
                assert param.grad is None, "Parameter {} should not have grad.".format(name)
            else:
                assert param.grad is not None, "Parameter {} should have grad.".format(name)
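
# The three ELMo tests above exercise the ``freeze_scalar_mix`` flag. Below is
# a minimal sketch of how that flag could be implemented, assuming the
# convention that scalar-mix parameter names contain "scalar_mix"; the helper
# name is hypothetical, not the contextualizer's actual API.
import torch


def freeze_scalar_mix_parameters(module: torch.nn.Module) -> None:
    for name, param in module.named_parameters():
        if "scalar_mix" in name:
            # Frozen parameters are excluded from backprop, so they never
            # receive a .grad -- exactly what the tests above assert.
            param.requires_grad = False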

    def test_pad_contextualizer_output(self):
        contextualizer_output = [
            torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]),
            torch.Tensor([[0.1, 0.2], [0.3, 0.4]]),
            torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]]),
            torch.Tensor([[0.1, 0.2]])
        ]
        padded_output, mask = pad_contextualizer_output(contextualizer_output)
        assert_allclose(
            padded_output.cpu().numpy(),
            np.array([[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0, 0]],
                      [[0.1, 0.2], [0.3, 0.4], [0, 0], [0, 0]],
                      [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]],
                      [[0.1, 0.2], [0, 0], [0, 0], [0, 0]]]))
        assert_allclose(
            mask.cpu().numpy(),
            np.array([[1, 1, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1], [1, 0, 0, 0]]))
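
# A minimal sketch of a padding helper consistent with the output asserted
# above, assuming each input element is a (seq_len, dim) tensor. It
# illustrates the contract; it is not necessarily the actual
# pad_contextualizer_output implementation.
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_contextualizer_output_sketch(outputs):
    lengths = torch.tensor([t.size(0) for t in outputs])
    # Zero-pad every sequence to the length of the longest one.
    # Shape: (batch_size, max_seq_len, dim)
    padded = pad_sequence(outputs, batch_first=True)
    # mask[i, j] is 1 iff position j holds a real token in sequence i.
    positions = torch.arange(padded.size(1)).unsqueeze(0)
    mask = (positions < lengths.unsqueeze(1)).long()
    return padded, mask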

    def test_glove_contextualizer_frozen(self):
        params = Params({
            "type": "glove_contextualizer",
            "glove_path": self.glove_path,
            "embedding_dim": self.representation_dim,
            "trainable": False
        })
        glove_contextualizer = Contextualizer.from_params(params)
        unpadded_representations = glove_contextualizer(
            [self.sentence_1, self.sentence_2, self.sentence_3])
        token_representations, mask = pad_contextualizer_output(
            unpadded_representations)
        loss = token_representations.sum()
        # Nothing in the contextualizer is requires_grad=True, so this
        # should be requires_grad=False and grad_fn should be None
        assert loss.grad_fn is None
        assert loss.requires_grad is False

    def test_glove_contextualizer_trainable(self):
        params = Params({
            "type": "glove_contextualizer",
            "glove_path": self.glove_path,
            "embedding_dim": self.representation_dim,
            "trainable": True
        })
        glove_contextualizer = Contextualizer.from_params(params)
        unpadded_representations = glove_contextualizer(
            [self.sentence_1, self.sentence_2, self.sentence_3])
        token_representations, mask = pad_contextualizer_output(
            unpadded_representations)
        loss = token_representations.sum()
        loss.backward()
        glove_grads = [
            param.grad for param in glove_contextualizer.parameters()
        ]
        assert all(grad is not None for grad in glove_grads)
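
# A minimal sketch of what the ``trainable`` flag above typically controls:
# whether the embedding weight participates in backprop. The class below is
# hypothetical and only illustrates the frozen/trainable distinction that the
# two GloVe tests assert.
import torch


class EmbeddingLookupSketch(torch.nn.Module):
    def __init__(self, weight: torch.Tensor, trainable: bool):
        super().__init__()
        # freeze=not trainable toggles weight.requires_grad, so a frozen
        # lookup produces outputs whose sum has grad_fn None, while a
        # trainable one accumulates a .grad after backward().
        self.embedding = torch.nn.Embedding.from_pretrained(
            weight, freeze=not trainable)

    def forward(self, indices: torch.LongTensor) -> torch.Tensor:
        return self.embedding(indices)
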
    def forward(
            self,  # type: ignore
            label_indices: torch.LongTensor,
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
        """
        If ``token_representations`` is provided, ``raw_tokens`` is not required. If
        ``token_representations`` is ``None``, then ``raw_tokens`` is required.

        Parameters
        ----------
        label_indices : torch.LongTensor
            A LongTensor of shape (batch_size, max_num_adpositions) with the indices
            of the tokens to predict a label for in each element (sentence) of the batch.
        token_representations : torch.FloatTensor, optional (default = None)
            A tensor of shape (batch_size, sequence_length, representation_dim) with
            the representations of the tokens. If None, we use a contextualizer
            within this model to produce the token representations.
        raw_tokens : List[List[str]], optional (default = None)
            A batch of lists with the raw token strings. Used to compute
            token_representations if it is None.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_label_indices)``.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_label_indices,
            num_classes)`` representing unnormalized log probabilities
            of the classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_label_indices,
            num_classes)`` representing a distribution of the tag classes.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimized.
        """
        # Convert to LongTensor
        # TODO: add PR to ArrayField to preserve array types.
        label_indices = label_indices.long()
        if token_representations is None:
            if self._contextualizer is None:
                raise ConfigurationError(
                    "token_representation not provided as input to the model, and no "
                    "contextualizer was specified. Either add a contextualizer to your "
                    "dataset reader (preferred if your contextualizer is frozen) or to "
                    "this model (if you wish to train your contextualizer).")
            if raw_tokens is None:
                raise ValueError(
                    "Input raw_tokens is ``None`` --- make sure to set "
                    "include_raw_tokens in the DatasetReader to True.")
            if label_indices is None:
                raise ValueError("Did not receive any token indices, needed "
                                 "if the contextualizer is within the model.")
            # Convert contextualizer output into a tensor
            # Shape: (batch_size, max_seq_len, representation_dim)
            token_representations, _ = pad_contextualizer_output(
                self._contextualizer(raw_tokens))

        # Move token representation to the same device as the
        # module (CPU or CUDA). TODO(nfliu): This only works if the module
        # is on one device.
        device = next(self._decoder._linear_layers[0].parameters()).device
        token_representations = token_representations.to(device)
        text_mask = get_text_mask_from_representations(token_representations)
        text_mask = text_mask.to(device)
        label_mask = self._get_label_mask_from_label_indices(label_indices)
        label_mask = label_mask.to(device)

        # Mask out the -1 padding in the label_indices, since that doesn't
        # work with indexing. Note that we can't 0 pad because 0 is actually
        # a valid label index, so we pad with -1 just for the purposes of
        # proper mask calculation and then convert to 0-padding by applying
        # the mask.
        label_indices = label_indices * label_mask

        # Encode the token representation.
        encoded_token_representations = self._encoder(token_representations,
                                                      text_mask)

        batch_size = label_indices.size(0)
        # Index into the encoded_token_representations to get tensors corresponding
        # to the representations of the tokens to predict labels for.
        # Shape: (batch_size, num_label_indices, representation_dim)
        range_vector = get_range_vector(
            batch_size, get_device_of(label_indices)).unsqueeze(1)
        selected_token_representations = encoded_token_representations[
            range_vector, label_indices]
        selected_token_representations = selected_token_representations.contiguous()

        # Decode out a label from the token representation
        # Shape: (batch_size, num_label_indices, num_classes)
        logits = self._decoder(selected_token_representations)
        class_probabilities = F.softmax(logits, dim=-1)
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }
        if labels is not None:
            loss = sequence_cross_entropy_with_logits(
                logits, labels, label_mask, average=self.loss_average)
            for name, metric in self.metrics.items():
                # When not running in error analysis mode, skip
                # metrics that start with "_"
                if not self.error_analysis and name.startswith("_"):
                    continue
                metric(logits, labels, label_mask.float())
            output_dict["loss"] = loss
        return output_dict
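
# A standalone illustration of the range-vector indexing used above to select
# the encoded representations at ``label_indices``. The shapes mirror the
# comments in forward(); the concrete values are made up for the example.
import torch

batch_size, seq_len, dim, num_label_indices = 2, 5, 3, 2
encoded = torch.randn(batch_size, seq_len, dim)
label_indices = torch.tensor([[0, 3], [1, 4]])
# Shape: (batch_size, 1); broadcasts against label_indices so that row i of
# the batch is indexed at its own label positions.
range_vector = torch.arange(batch_size).unsqueeze(1)
# Shape: (batch_size, num_label_indices, dim)
selected = encoded[range_vector, label_indices]
assert selected.shape == (batch_size, num_label_indices, dim)
assert torch.equal(selected[0, 1], encoded[0, 3])
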
    def forward(self,  # type: ignore
                token_representations: torch.FloatTensor = None,
                raw_tokens: List[List[str]] = None,
                labels: torch.LongTensor = None,
                **kwargs) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        token_representations : torch.FloatTensor, optional (default = None)
            A padded tensor of shape (batch_size, seq_len, representation_dim),
            with the representations of the tokens. If None, we use a contextualizer within
            this model to produce the token representation.
        raw_tokens : List[List[str]], optional (default = None)
            A batch of lists with the raw token strings. Used to compute token_representations
            if it is None.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        if token_representations is None:
            if self._contextualizer is None:
                raise ConfigurationError(
                    "token_representations not provided as input to the model, and no "
                    "contextualizer was specified. Either add a contextualizer to your "
                    "dataset reader (preferred if your contextualizer is frozen) or to "
                    "this model (if you wish to train your contextualizer).")
            if raw_tokens is None:
                raise ValueError("Input raw_tokens is ``None`` and token representations "
                                 "were not provided!")
            token_representations, mask = pad_contextualizer_output(
                self._contextualizer(raw_tokens))
            # Move token representations to the same device as the
            # module (CPU or CUDA). TODO(nfliu): This only works if the module
            # is on one device.
            device = next(self._decoder._module._linear_layers[0].parameters()).device
            token_representations = token_representations.to(device)
            mask = mask.to(device)
        else:
            mask = get_text_mask_from_representations(token_representations)

        batch_size, sequence_length = mask.size()

        # Encode the token representations.
        encoded_token_representations = self._encoder(token_representations, mask)

        logits = self._decoder(encoded_token_representations)

        output_dict = {}
        # Run CRF if provided and calculate class_probabilities
        if self._crf:
            best_paths = self._crf.viterbi_tags(logits, mask)
            # Just get the tags and ignore the score.
            predicted_tags = [x for x, y in best_paths]
            # Add tags to output dict
            output_dict["tags"] = predicted_tags
            # Get the class probabilities from the viterbi tags
            class_probabilities = logits * 0.
            for i, instance_tags in enumerate(predicted_tags):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1
        else:
            reshaped_log_probs = logits.view(-1, self._num_classes)
            class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
                [batch_size, sequence_length, self._num_classes])

        output_dict["logits"] = logits
        output_dict["mask"] = mask
        output_dict["class_probabilities"] = class_probabilities

        if labels is not None:
            if self._crf:
                # Add negative log-likelihood as loss
                log_likelihood = self._crf(logits, labels, mask)
                loss = -log_likelihood
            else:
                loss = sequence_cross_entropy_with_logits(logits, labels, mask,
                                                          average=self.loss_average)

            for name, metric in self.metrics.items():
                # When not running in error analysis mode, skip
                # metrics that start with "_"
                if not self.error_analysis and name.startswith("_"):
                    continue
                if name == "perplexity":
                    # Perplexity metric API is a bit different from the others.
                    metric(loss, mask.float().sum())
                else:
                    metric(class_probabilities, labels, mask.float())
            output_dict["loss"] = loss
        return output_dict
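
# A minimal sketch of how a mask might be recovered from zero-padded
# representations, in the spirit of the get_text_mask_from_representations
# call above; the real helper may differ in detail.
import torch


def text_mask_from_representations_sketch(
        token_representations: torch.FloatTensor) -> torch.LongTensor:
    # Treat a position as padding iff its entire representation vector is
    # exactly zero, matching the zero-padding convention demonstrated in
    # test_pad_contextualizer_output.
    # Shape: (batch_size, seq_len)
    return (token_representations.abs().sum(dim=-1) != 0).long()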